Decision Tree
Decision Tree¶
Overview¶
Decision trees are algorithms that make decisions by splitting data according to features into a tree structure. They are intuitive and easy to interpret, making them widely used in practice.
1. Basic Concepts of Decision Trees¶
1.1 Tree Structure¶
"""
Decision tree components:
1. Root Node: First split point
2. Internal Node: Intermediate split points
3. Leaf Node: Final prediction value
4. Split: Data splitting based on features
5. Depth: Distance from root to node
Example: Titanic survival prediction
[Gender]
/ \
Male Female
| |
[Age] Survived
/ \
<10 >=10
| |
Survived Died
"""
1.2 Basic Usage¶
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the iris dataset and hold out 20% of it for evaluation.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Fit an unconstrained tree (grows until every leaf is pure).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score on the held-out split.
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Dump the learned decision rules as indented text.
print("\nTree Structure:")
print(export_text(clf, feature_names=iris.feature_names))
1.3 Tree Visualization¶
# Render the fitted tree: filled node colors show the majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Decision Tree - Iris Classification')
plt.tight_layout()
plt.show()

# Impurity-based importance of each input feature (sums to 1).
print("\nFeature Importance:")
for fname, score in zip(iris.feature_names, clf.feature_importances_):
    print(f" {fname}: {score:.4f}")
2. Split Criteria¶
2.1 Entropy¶
import numpy as np


def entropy(y):
    """Shannon entropy (in bits) of the label distribution in *y*.

    Args:
        y: 1-D array-like of class labels (any hashable/comparable dtype).

    Returns:
        float: entropy in [0, log2(k)] for k distinct classes; exactly 0.0
        for a pure node, and 0.0 for empty input.
    """
    labels = np.asarray(y)
    if labels.size == 0:  # an empty node carries no information
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    # np.unique only reports classes that actually occur, so every
    # probability is strictly positive and log2 is well defined.
    # (The original "+ 1e-10" fudge was unnecessary and made a pure node
    # return a tiny negative value instead of exactly 0.)
    return float(-np.sum(probabilities * np.log2(probabilities)))
# Examples
# Three toy label sets: zero, intermediate, and maximal entropy.
y_pure = [0, 0, 0, 0, 0]     # Pure node
y_mixed = [0, 0, 1, 1, 1]    # Mixed node
y_balanced = [0, 0, 1, 1]    # Balanced node

print("Entropy examples:")
for label, ys in [
    (" Pure node", y_pure),            # expect 0
    (" Mixed node [2:3]", y_mixed),
    (" Balanced node [2:2]", y_balanced),  # expect 1 (maximum)
]:
    print(f"{label}: {entropy(ys):.4f}")
2.2 Gini Impurity¶
def gini_impurity(y):
    """Gini impurity of the labels in *y*: 1 - sum_k p_k^2.

    Equals 0 for a pure node and 0.5 for two perfectly balanced classes
    (the two-class maximum).
    """
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1 - (p ** 2).sum()
print("\nGini impurity examples:")
print(f" Pure node: {gini_impurity(y_pure):.4f}") # 0
print(f" Mixed node: {gini_impurity(y_mixed):.4f}")
print(f" Balanced node: {gini_impurity(y_balanced):.4f}") # 0.5 (maximum)
# Comparison: Entropy vs Gini
"""
- Gini: Faster computation, default
- Entropy: Tends toward more balanced trees
- In practice, little difference
"""
2.3 Information Gain¶
def information_gain(parent, left_child, right_child, criterion='gini'):
    """Impurity reduction achieved by splitting *parent* into two children.

    Args:
        parent: labels at the node before the split.
        left_child, right_child: labels routed to each side of the split.
        criterion: 'gini' for Gini impurity; anything else uses entropy.

    Returns:
        Parent impurity minus the sample-weighted mean child impurity.
    """
    measure = gini_impurity if criterion == 'gini' else entropy

    # Weight each child's impurity by its share of the parent's samples.
    n_left = len(left_child)
    n_right = len(right_child)
    total = n_left + n_right
    children_impurity = (n_left / total) * measure(left_child) \
        + (n_right / total) * measure(right_child)

    return measure(parent) - children_impurity
# Compare a perfect split against a mixed split of the same parent node.
parent = [0, 0, 0, 1, 1, 1]
left_a, right_a = [0, 0, 0], [1, 1, 1]  # Split A: classes fully separated
left_b, right_b = [0, 0, 1], [0, 1, 1]  # Split B: children remain mixed

print("\nInformation gain comparison:")
print(f" Split A (perfect): {information_gain(parent, left_a, right_a):.4f}")
print(f" Split B (mixed): {information_gain(parent, left_b, right_b):.4f}")
3. CART Algorithm¶
3.1 Classification Tree¶
from sklearn.tree import DecisionTreeClassifier

# Refit the iris classifier once per supported split criterion.
criteria = ['gini', 'entropy', 'log_loss']
print("Classification Tree - Criterion Comparison:")
for criterion in criteria:
    # NOTE: intentionally rebinds the module-level `clf`/`y_pred`,
    # matching the original cell's side effects.
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f" {criterion}: Accuracy = {accuracy:.4f}, Depth = {clf.get_depth()}")
3.2 Regression Tree¶
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Diabetes regression data, 80/20 split.
diabetes = load_diabetes()
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Baseline regression tree: squared error == variance reduction per split.
reg = DecisionTreeRegressor(criterion='squared_error', random_state=42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

print("\nRegression Tree Results:")
print(f" MSE: {mean_squared_error(y_test_r, y_pred_r):.4f}")
print(f" R²: {r2_score(y_test_r, y_pred_r):.4f}")

# Retrain with each supported regression criterion and compare test MSE.
criteria_reg = ['squared_error', 'friedman_mse', 'absolute_error']
print("\nRegression Tree - Criterion Comparison:")
for criterion in criteria_reg:
    reg = DecisionTreeRegressor(criterion=criterion, random_state=42)
    reg.fit(X_train_r, y_train_r)
    y_pred = reg.predict(X_test_r)
    mse = mean_squared_error(y_test_r, y_pred)
    print(f" {criterion}: MSE = {mse:.4f}")
3.3 Split Search Process¶
"""
CART algorithm split process:
1. For all features:
- Consider all possible split points
- Calculate impurity reduction for each split
2. Select optimal split:
- Choose (feature, split point) with maximum impurity reduction
3. Recursive splitting:
- Repeat steps 1-2 for each child node
- Stop when termination condition met
Termination conditions:
- Maximum depth reached
- Node sample count below minimum threshold
- Pure node reached (impurity = 0)
"""
# Simulate the CART split search for a single feature.
def find_best_split(X, y, feature_idx):
    """Exhaustively search one feature for the threshold with maximal gain.

    Candidate thresholds are midpoints between consecutive distinct sorted
    values, mirroring CART. Returns (best_threshold, best_gain);
    threshold is None (gain -1) when the feature has one distinct value.
    """
    values = X[:, feature_idx]
    order = np.argsort(values)
    best_threshold, best_gain = None, -1

    # Walk consecutive pairs in sorted order.
    for prev, curr in zip(order[:-1], order[1:]):
        lo, hi = values[prev], values[curr]
        if lo == hi:  # no boundary between equal values
            continue
        threshold = (lo + hi) / 2
        left = values <= threshold
        right = ~left
        if not left.any() or not right.any():  # degenerate split
            continue
        gain = information_gain(y, y[left], y[right])
        if gain > best_gain:
            best_gain, best_threshold = gain, threshold

    return best_threshold, best_gain


# Report the best threshold and gain for every iris feature.
print("\nOptimal split point search:")
for i, name in enumerate(iris.feature_names):
    threshold, gain = find_best_split(iris.data, iris.target, i)
    print(f" {name}: threshold={threshold:.2f}, gain={gain:.4f}")
4. Pruning¶
4.1 Pre-pruning¶
# Pre-pruning: constrain growth during fitting via constructor arguments.
clf_pruned = DecisionTreeClassifier(
    max_depth=3,            # Maximum depth
    min_samples_split=10,   # Minimum samples required to split a node
    min_samples_leaf=5,     # Minimum samples allowed in a leaf
    max_features='sqrt',    # Features considered per split
    max_leaf_nodes=10,      # Cap on total leaves
    random_state=42,
)
clf_pruned.fit(X_train, y_train)

print("Pre-pruning results:")
print(f" Depth: {clf_pruned.get_depth()}")
print(f" Leaf nodes: {clf_pruned.get_n_leaves()}")
print(f" Accuracy: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}")
4.2 Post-pruning - Cost Complexity Pruning¶
# CCP (Cost Complexity Pruning): compute the effective-alpha path of the
# last fitted tree, then train one pruned tree per alpha.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

print("CCP Alpha path:")
print(f" Number of alpha values: {len(ccp_alphas)}")

clfs = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    clf_ccp.fit(X_train, y_train)
    clfs.append(clf_ccp)

# Track how accuracy, leaf count, and depth shrink as alpha grows.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]
n_leaves = [model.get_n_leaves() for model in clfs]
depths = [model.get_depth() for model in clfs]
# Visualize the pruning path: accuracy, leaf count, and depth vs alpha.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panel 0: train/test accuracy as step plots over alpha.
axes[0].plot(ccp_alphas, train_scores, marker='o', label='Train', drawstyle='steps-post')
axes[0].plot(ccp_alphas, test_scores, marker='o', label='Test', drawstyle='steps-post')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Alpha vs Accuracy')
axes[0].legend()

# Panels 1-2: tree size metrics share the same layout.
for ax, series, ylabel in (
    (axes[1], n_leaves, 'Number of Leaves'),
    (axes[2], depths, 'Depth'),
):
    ax.plot(ccp_alphas, series, marker='o', drawstyle='steps-post')
    ax.set_xlabel('Alpha')
    ax.set_ylabel(ylabel)
    ax.set_title(f'Alpha vs {ylabel}')

plt.tight_layout()
plt.show()
# Select the alpha whose 5-fold CV accuracy is highest.
from sklearn.model_selection import cross_val_score

cv_scores = [
    cross_val_score(
        DecisionTreeClassifier(ccp_alpha=alpha, random_state=42),
        X_train, y_train, cv=5,
    ).mean()
    for alpha in ccp_alphas
]

best_idx = np.argmax(cv_scores)
best_alpha = ccp_alphas[best_idx]
print(f"\nOptimal Alpha: {best_alpha:.6f}")
print(f"Optimal CV score: {cv_scores[best_idx]:.4f}")
5. Decision Boundary Visualization¶
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np

# Synthetic 2-feature, 2-class problem so the boundary can be drawn.
X_2d, y_2d = make_classification(
    n_samples=200, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1, random_state=42
)

# One panel per max_depth: deeper trees carve increasingly fine regions.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
depths = [1, 2, 3, 5, 10, None]
for ax, depth in zip(axes.flatten(), depths):
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_2d, y_2d)

    # Predict on a 100x100 grid covering the data plus a margin.
    x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
    y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, edgecolors='black', cmap='RdYlBu')
    depth_str = depth if depth else 'None'
    ax.set_title(f'Max Depth = {depth_str}\nAccuracy = {clf.score(X_2d, y_2d):.3f}')

plt.tight_layout()
plt.show()
6. Advantages and Disadvantages of Decision Trees¶
6.1 Pros and Cons¶
"""
Advantages:
1. Easy to interpret: Visualize decision-making process
2. Minimal preprocessing: No scaling or normalization needed
3. Nonlinear relationships: Can learn complex nonlinear patterns
4. Various data types: Handle both numerical and categorical
5. Fast prediction: O(log n) time complexity
Disadvantages:
1. Overfitting tendency: Deep trees easily overfit
2. Instability: Sensitive to small data changes
3. Optimization limitations: No global optimum guarantee (greedy)
4. Cannot extrapolate: Difficult to predict outside training range
5. Bias: Sensitive to class imbalance
"""
7. Hyperparameter Tuning¶
from sklearn.model_selection import GridSearchCV

# Jointly search depth, split/leaf sizes, and criterion with 5-fold CV.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all cores; the grid is 5*4*4*2 = 160 fits per fold
)
grid_search.fit(X_train, y_train)

print("Hyperparameter tuning results:")
print(f" Best parameters: {grid_search.best_params_}")
print(f" Best CV score: {grid_search.best_score_:.4f}")
print(f" Test score: {grid_search.score(X_test, y_test):.4f}")
8. Feature Importance¶
# Fit an unpruned tree on the full iris data and rank its features.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(iris.data, iris.target)

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # descending importance

# Bar chart, most important feature first.
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)),
           [iris.feature_names[i] for i in indices], rotation=45)
plt.ylabel('Feature Importance')
plt.title('Decision Tree Feature Importance')
plt.tight_layout()
plt.show()

print("\nFeature importance ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f" {rank}. {iris.feature_names[idx]}: {importances[idx]:.4f}")
Practice Problems¶
Problem 1: Basic Classification¶
Train a decision tree on breast cancer data and evaluate.
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Breast-cancer binary classification, 80/20 split.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Solution: a depth-limited tree generalizes better than an unpruned one.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
Problem 2: Pruning¶
Find optimal alpha using CCP and prune the tree.
# Solution
from sklearn.model_selection import cross_val_score

# Fit the full tree, then compute its cost-complexity pruning path.
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)
path = clf_full.cost_complexity_pruning_path(X_train, y_train)

# Find optimal alpha with cross-validation.
# Fix: the largest alpha in the path prunes the tree down to a single
# root node, so it is excluded (sklearn's CCP example does the same);
# the remaining alphas are subsampled for efficiency.
best_alpha = 0
best_score = 0
for alpha in path.ccp_alphas[:-1][::5]:  # Sample for efficiency
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    mean_score = scores.mean()  # hoist: was computed twice per iteration
    if mean_score > best_score:
        best_score = mean_score
        best_alpha = alpha

print(f"Optimal Alpha: {best_alpha:.6f}")
print(f"Optimal CV score: {best_score:.4f}")

# Refit with the selected alpha and evaluate on the held-out split.
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)
print(f"Test accuracy: {clf_pruned.score(X_test, y_test):.4f}")
Problem 3: Regression Tree¶
Train a regression tree on diabetes data.
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Diabetes regression target, 80/20 split.
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Solution: shallow tree with large leaves to keep variance down.
reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=10, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {test_mse:.4f}")
print(f"RMSE: {np.sqrt(test_mse):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
Summary¶
| Concept | Description | Use |
|---|---|---|
| Entropy | Measure of information uncertainty | Split criterion (criterion='entropy') |
| Gini Impurity | Probability of misclassification | Split criterion (criterion='gini') |
| Information Gain | Impurity reduction after split | Select optimal split |
| max_depth | Maximum tree depth | Prevent overfitting |
| min_samples_split | Minimum samples for split | Prevent overfitting |
| min_samples_leaf | Minimum samples in leaf node | Prevent overfitting |
| ccp_alpha | Cost-complexity pruning | Post-pruning |
| feature_importances_ | Feature importance | Feature selection |
Decision Tree Checklist¶
- Apply pruning to prevent overfitting
- Use feature importance for interpretability
- Consider ensemble methods (Random Forest) for instability
- No scaling needed for numerical features
- Categorical features require encoding (for sklearn)