Decision Tree
Decision Tree¶
Overview¶
Decision trees are algorithms that make decisions by splitting data according to features into a tree structure. They are intuitive and easy to interpret, making them widely used in practice.
1. Basic Concepts of Decision Trees¶
1.1 Tree Structure¶
"""
Decision tree components:
1. Root Node: First split point
2. Internal Node: Intermediate split points
3. Leaf Node: Final prediction value
4. Split: Data splitting based on features
5. Depth: Distance from root to node
Example: Titanic survival prediction
[Gender]
/ \
Male Female
| |
[Age] Survived
/ \
<10 >=10
| |
Survived Died
"""
1.2 Basic Usage¶
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the iris dataset and hold out 20% of it for evaluation.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Fit an unconstrained tree (grows until every leaf is pure).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score on the held-out split.
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Dump the learned decision rules as indented text.
print("\nTree Structure:")
print(export_text(clf, feature_names=iris.feature_names))
1.3 Tree Visualization¶
# Render the fitted tree: filled node colors show the majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Decision Tree - Iris Classification')
plt.tight_layout()
plt.show()

# Impurity-based importance of each input feature (sums to 1).
print("\nFeature Importance:")
for fname, score in zip(iris.feature_names, clf.feature_importances_):
    print(f" {fname}: {score:.4f}")
2. Split Criteria¶
2.1 Entropy¶
import numpy as np


def entropy(y):
    """Shannon entropy (in bits) of the label distribution in *y*.

    Args:
        y: 1-D array-like of class labels (any hashable/comparable dtype).

    Returns:
        float: entropy in [0, log2(k)] for k distinct classes; exactly 0.0
        for a pure node, and 0.0 for empty input.
    """
    labels = np.asarray(y)
    if labels.size == 0:  # an empty node carries no information
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    # np.unique only reports classes that actually occur, so every
    # probability is strictly positive and log2 is well defined.
    # (The original "+ 1e-10" fudge was unnecessary and made a pure node
    # return a tiny negative value instead of exactly 0.)
    return float(-np.sum(probabilities * np.log2(probabilities)))
# Examples
# Three toy label sets: zero, intermediate, and maximal entropy.
y_pure = [0, 0, 0, 0, 0]     # Pure node
y_mixed = [0, 0, 1, 1, 1]    # Mixed node
y_balanced = [0, 0, 1, 1]    # Balanced node

print("Entropy examples:")
for label, ys in [
    (" Pure node", y_pure),            # expect 0
    (" Mixed node [2:3]", y_mixed),
    (" Balanced node [2:2]", y_balanced),  # expect 1 (maximum)
]:
    print(f"{label}: {entropy(ys):.4f}")
2.2 Gini Impurity¶
def gini_impurity(y):
    """Gini impurity of the labels in *y*: 1 - sum_k p_k^2.

    Equals 0 for a pure node and 0.5 for two perfectly balanced classes
    (the two-class maximum).
    """
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1 - (p ** 2).sum()
print("\nGini impurity examples:")
print(f" Pure node: {gini_impurity(y_pure):.4f}") # 0
print(f" Mixed node: {gini_impurity(y_mixed):.4f}")
print(f" Balanced node: {gini_impurity(y_balanced):.4f}") # 0.5 (maximum)
# Comparison: Entropy vs Gini
"""
- Gini: Faster computation, default
- Entropy: Tends toward more balanced trees
- In practice, little difference
"""
2.3 Information Gain¶
def information_gain(parent, left_child, right_child, criterion='gini'):
    """Impurity reduction achieved by splitting *parent* into two children.

    Args:
        parent: labels at the node before the split.
        left_child, right_child: labels routed to each side of the split.
        criterion: 'gini' for Gini impurity; anything else uses entropy.

    Returns:
        Parent impurity minus the sample-weighted mean child impurity.
    """
    measure = gini_impurity if criterion == 'gini' else entropy

    # Weight each child's impurity by its share of the parent's samples.
    n_left = len(left_child)
    n_right = len(right_child)
    total = n_left + n_right
    children_impurity = (n_left / total) * measure(left_child) \
        + (n_right / total) * measure(right_child)

    return measure(parent) - children_impurity
# Compare a perfect split against a mixed split of the same parent node.
parent = [0, 0, 0, 1, 1, 1]
left_a, right_a = [0, 0, 0], [1, 1, 1]  # Split A: classes fully separated
left_b, right_b = [0, 0, 1], [0, 1, 1]  # Split B: children remain mixed

print("\nInformation gain comparison:")
print(f" Split A (perfect): {information_gain(parent, left_a, right_a):.4f}")
print(f" Split B (mixed): {information_gain(parent, left_b, right_b):.4f}")
3. CART Algorithm¶
3.1 Classification Tree¶
from sklearn.tree import DecisionTreeClassifier

# Refit the iris classifier once per supported split criterion.
criteria = ['gini', 'entropy', 'log_loss']
print("Classification Tree - Criterion Comparison:")
for criterion in criteria:
    # NOTE: intentionally rebinds the module-level `clf`/`y_pred`,
    # matching the original cell's side effects.
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f" {criterion}: Accuracy = {accuracy:.4f}, Depth = {clf.get_depth()}")
3.2 Regression Tree¶
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Diabetes regression data, 80/20 split.
diabetes = load_diabetes()
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Baseline regression tree: squared error == variance reduction per split.
reg = DecisionTreeRegressor(criterion='squared_error', random_state=42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

print("\nRegression Tree Results:")
print(f" MSE: {mean_squared_error(y_test_r, y_pred_r):.4f}")
print(f" R²: {r2_score(y_test_r, y_pred_r):.4f}")

# Retrain with each supported regression criterion and compare test MSE.
criteria_reg = ['squared_error', 'friedman_mse', 'absolute_error']
print("\nRegression Tree - Criterion Comparison:")
for criterion in criteria_reg:
    reg = DecisionTreeRegressor(criterion=criterion, random_state=42)
    reg.fit(X_train_r, y_train_r)
    y_pred = reg.predict(X_test_r)
    mse = mean_squared_error(y_test_r, y_pred)
    print(f" {criterion}: MSE = {mse:.4f}")
3.3 Split Search Process¶
"""
CART algorithm split process:
1. For all features:
- Consider all possible split points
- Calculate impurity reduction for each split
2. Select optimal split:
- Choose (feature, split point) with maximum impurity reduction
3. Recursive splitting:
- Repeat steps 1-2 for each child node
- Stop when termination condition met
Termination conditions:
- Maximum depth reached
- Node sample count below minimum threshold
- Pure node reached (impurity = 0)
"""
# Simulate the CART split search for a single feature.
def find_best_split(X, y, feature_idx):
    """Exhaustively search one feature for the threshold with maximal gain.

    Candidate thresholds are midpoints between consecutive distinct sorted
    values, mirroring CART. Returns (best_threshold, best_gain);
    threshold is None (gain -1) when the feature has one distinct value.
    """
    values = X[:, feature_idx]
    order = np.argsort(values)
    best_threshold, best_gain = None, -1

    # Walk consecutive pairs in sorted order.
    for prev, curr in zip(order[:-1], order[1:]):
        lo, hi = values[prev], values[curr]
        if lo == hi:  # no boundary between equal values
            continue
        threshold = (lo + hi) / 2
        left = values <= threshold
        right = ~left
        if not left.any() or not right.any():  # degenerate split
            continue
        gain = information_gain(y, y[left], y[right])
        if gain > best_gain:
            best_gain, best_threshold = gain, threshold

    return best_threshold, best_gain


# Report the best threshold and gain for every iris feature.
print("\nOptimal split point search:")
for i, name in enumerate(iris.feature_names):
    threshold, gain = find_best_split(iris.data, iris.target, i)
    print(f" {name}: threshold={threshold:.2f}, gain={gain:.4f}")
4. Pruning¶
4.1 Pre-pruning¶
# Pre-pruning: constrain growth during fitting via constructor arguments.
clf_pruned = DecisionTreeClassifier(
    max_depth=3,            # Maximum depth
    min_samples_split=10,   # Minimum samples required to split a node
    min_samples_leaf=5,     # Minimum samples allowed in a leaf
    max_features='sqrt',    # Features considered per split
    max_leaf_nodes=10,      # Cap on total leaves
    random_state=42,
)
clf_pruned.fit(X_train, y_train)

print("Pre-pruning results:")
print(f" Depth: {clf_pruned.get_depth()}")
print(f" Leaf nodes: {clf_pruned.get_n_leaves()}")
print(f" Accuracy: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}")
4.2 Post-pruning - Cost Complexity Pruning¶
# CCP (Cost Complexity Pruning): compute the effective-alpha path of the
# last fitted tree, then train one pruned tree per alpha.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

print("CCP Alpha path:")
print(f" Number of alpha values: {len(ccp_alphas)}")

clfs = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    clf_ccp.fit(X_train, y_train)
    clfs.append(clf_ccp)

# Track how accuracy, leaf count, and depth shrink as alpha grows.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]
n_leaves = [model.get_n_leaves() for model in clfs]
depths = [model.get_depth() for model in clfs]
# Visualize the pruning path: accuracy, leaf count, and depth vs alpha.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panel 0: train/test accuracy as step plots over alpha.
axes[0].plot(ccp_alphas, train_scores, marker='o', label='Train', drawstyle='steps-post')
axes[0].plot(ccp_alphas, test_scores, marker='o', label='Test', drawstyle='steps-post')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Alpha vs Accuracy')
axes[0].legend()

# Panels 1-2: tree size metrics share the same layout.
for ax, series, ylabel in (
    (axes[1], n_leaves, 'Number of Leaves'),
    (axes[2], depths, 'Depth'),
):
    ax.plot(ccp_alphas, series, marker='o', drawstyle='steps-post')
    ax.set_xlabel('Alpha')
    ax.set_ylabel(ylabel)
    ax.set_title(f'Alpha vs {ylabel}')

plt.tight_layout()
plt.show()
# Select the alpha whose 5-fold CV accuracy is highest.
from sklearn.model_selection import cross_val_score

cv_scores = [
    cross_val_score(
        DecisionTreeClassifier(ccp_alpha=alpha, random_state=42),
        X_train, y_train, cv=5,
    ).mean()
    for alpha in ccp_alphas
]

best_idx = np.argmax(cv_scores)
best_alpha = ccp_alphas[best_idx]
print(f"\nOptimal Alpha: {best_alpha:.6f}")
print(f"Optimal CV score: {cv_scores[best_idx]:.4f}")
5. Decision Boundary Visualization¶
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np

# Synthetic 2-feature, 2-class problem so the boundary can be drawn.
X_2d, y_2d = make_classification(
    n_samples=200, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1, random_state=42
)

# One panel per max_depth: deeper trees carve increasingly fine regions.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
depths = [1, 2, 3, 5, 10, None]
for ax, depth in zip(axes.flatten(), depths):
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_2d, y_2d)

    # Predict on a 100x100 grid covering the data plus a margin.
    x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
    y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, edgecolors='black', cmap='RdYlBu')
    depth_str = depth if depth else 'None'
    ax.set_title(f'Max Depth = {depth_str}\nAccuracy = {clf.score(X_2d, y_2d):.3f}')

plt.tight_layout()
plt.show()
6. Advantages and Disadvantages of Decision Trees¶
6.1 Pros and Cons¶
"""
Advantages:
1. Easy to interpret: Visualize decision-making process
2. Minimal preprocessing: No scaling or normalization needed
3. Nonlinear relationships: Can learn complex nonlinear patterns
4. Various data types: Handle both numerical and categorical
5. Fast prediction: O(log n) time complexity
Disadvantages:
1. Overfitting tendency: Deep trees easily overfit
2. Instability: Sensitive to small data changes
3. Optimization limitations: No global optimum guarantee (greedy)
4. Cannot extrapolate: Difficult to predict outside training range
5. Bias: Sensitive to class imbalance
"""
7. Hyperparameter Tuning¶
from sklearn.model_selection import GridSearchCV

# Jointly search depth, split/leaf sizes, and criterion with 5-fold CV.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all cores; the grid is 5*4*4*2 = 160 fits per fold
)
grid_search.fit(X_train, y_train)

print("Hyperparameter tuning results:")
print(f" Best parameters: {grid_search.best_params_}")
print(f" Best CV score: {grid_search.best_score_:.4f}")
print(f" Test score: {grid_search.score(X_test, y_test):.4f}")
8. Feature Importance¶
# Fit an unpruned tree on the full iris data and rank its features.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(iris.data, iris.target)

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # descending importance

# Bar chart, most important feature first.
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)),
           [iris.feature_names[i] for i in indices], rotation=45)
plt.ylabel('Feature Importance')
plt.title('Decision Tree Feature Importance')
plt.tight_layout()
plt.show()

print("\nFeature importance ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f" {rank}. {iris.feature_names[idx]}: {importances[idx]:.4f}")
Practice Problems¶
Problem 1: Basic Classification¶
Train a decision tree on breast cancer data and evaluate.
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Breast-cancer binary classification, 80/20 split.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Solution: a depth-limited tree generalizes better than an unpruned one.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
Problem 2: Pruning¶
Find optimal alpha using CCP and prune the tree.
# Solution
from sklearn.model_selection import cross_val_score

# Fit the full tree, then compute its cost-complexity pruning path.
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)
path = clf_full.cost_complexity_pruning_path(X_train, y_train)

# Find optimal alpha with cross-validation.
# Fix: the largest alpha in the path prunes the tree down to a single
# root node, so it is excluded (sklearn's CCP example does the same);
# the remaining alphas are subsampled for efficiency.
best_alpha = 0
best_score = 0
for alpha in path.ccp_alphas[:-1][::5]:  # Sample for efficiency
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    mean_score = scores.mean()  # hoist: was computed twice per iteration
    if mean_score > best_score:
        best_score = mean_score
        best_alpha = alpha

print(f"Optimal Alpha: {best_alpha:.6f}")
print(f"Optimal CV score: {best_score:.4f}")

# Refit with the selected alpha and evaluate on the held-out split.
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)
print(f"Test accuracy: {clf_pruned.score(X_test, y_test):.4f}")
Problem 3: Regression Tree¶
Train a regression tree on diabetes data.
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Diabetes regression target, 80/20 split.
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Solution: shallow tree with large leaves to keep variance down.
reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=10, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {test_mse:.4f}")
print(f"RMSE: {np.sqrt(test_mse):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
Summary¶
| Concept | Description | Use |
|---|---|---|
| Entropy | Measure of information uncertainty | Split criterion (criterion='entropy') |
| Gini Impurity | Probability of misclassification | Split criterion (criterion='gini') |
| Information Gain | Impurity reduction after split | Select optimal split |
| max_depth | Maximum tree depth | Prevent overfitting |
| min_samples_split | Minimum samples for split | Prevent overfitting |
| min_samples_leaf | Minimum samples in leaf node | Prevent overfitting |
| ccp_alpha | Cost-complexity pruning | Post-pruning |
| feature_importances_ | Feature importance | Feature selection |
Decision Tree Checklist¶
- Apply pruning to prevent overfitting
- Use feature importance for interpretability
- Consider ensemble methods (Random Forest) for instability
- No scaling needed for numerical features
- Categorical features require encoding (for sklearn)