๊ฒฐ์ • ํŠธ๋ฆฌ (Decision Tree)

๊ฐœ์š”

๊ฒฐ์ • ํŠธ๋ฆฌ๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ํŠน์„ฑ(feature)์— ๋”ฐ๋ผ ๋ถ„ํ• ํ•˜์—ฌ ํŠธ๋ฆฌ ๊ตฌ์กฐ๋กœ ์˜์‚ฌ๊ฒฐ์ •์„ ์ˆ˜ํ–‰ํ•˜๋Š” ์•Œ๊ณ ๋ฆฌ์ฆ˜์ž…๋‹ˆ๋‹ค. ์ง๊ด€์ ์ด๊ณ  ํ•ด์„์ด ์‰ฌ์›Œ ์‹ค๋ฌด์—์„œ ๋งŽ์ด ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค.


1. ๊ฒฐ์ • ํŠธ๋ฆฌ์˜ ๊ธฐ๋ณธ ๊ฐœ๋…

1.1 ํŠธ๋ฆฌ ๊ตฌ์กฐ

"""
๊ฒฐ์ • ํŠธ๋ฆฌ ๊ตฌ์„ฑ ์š”์†Œ:
1. ๋ฃจํŠธ ๋…ธ๋“œ (Root Node): ์ฒซ ๋ฒˆ์งธ ๋ถ„ํ•  ์ง€์ 
2. ๋‚ด๋ถ€ ๋…ธ๋“œ (Internal Node): ์ค‘๊ฐ„ ๋ถ„ํ•  ์ง€์ 
3. ๋ฆฌํ”„ ๋…ธ๋“œ (Leaf Node): ์ตœ์ข… ์˜ˆ์ธก๊ฐ’
4. ๋ถ„ํ•  (Split): ํŠน์„ฑ์— ๋”ฐ๋ฅธ ๋ฐ์ดํ„ฐ ๋ถ„ํ• 
5. ๊นŠ์ด (Depth): ๋ฃจํŠธ์—์„œ ๋…ธ๋“œ๊นŒ์ง€์˜ ๊ฑฐ๋ฆฌ

์˜ˆ์‹œ: ํƒ€์ดํƒ€๋‹‰ ์ƒ์กด ์˜ˆ์ธก
          [์„ฑ๋ณ„]
         /      \
      ๋‚จ์„ฑ       ์—ฌ์„ฑ
       |          |
    [๋‚˜์ด]      ์ƒ์กด
    /    \
  <10   >=10
   |      |
 ์ƒ์กด   ์‚ฌ๋ง
"""

1.2 ๊ธฐ๋ณธ ์‚ฌ์šฉ๋ฒ•

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ๋ฐ์ดํ„ฐ ๋กœ๋“œ
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# ๋ชจ๋ธ ์ƒ์„ฑ ๋ฐ ํ•™์Šต
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# ์˜ˆ์ธก
y_pred = clf.predict(X_test)
print(f"์ •ํ™•๋„: {accuracy_score(y_test, y_pred):.4f}")

# ํŠธ๋ฆฌ ๊ตฌ์กฐ ์ถœ๋ ฅ
print("\nํŠธ๋ฆฌ ๊ตฌ์กฐ:")
print(export_text(clf, feature_names=iris.feature_names))

1.3 ํŠธ๋ฆฌ ์‹œ๊ฐํ™”

# ์‹œ๊ฐํ™”
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title('Decision Tree - Iris Classification')
plt.tight_layout()
plt.show()

# ํŠน์„ฑ ์ค‘์š”๋„
print("\nํŠน์„ฑ ์ค‘์š”๋„:")
for name, importance in zip(iris.feature_names, clf.feature_importances_):
    print(f"  {name}: {importance:.4f}")

2. ๋ถ„ํ•  ๊ธฐ์ค€ (Split Criteria)

2.1 ์—”ํŠธ๋กœํ”ผ (Entropy)

import numpy as np

def entropy(y):
    """Return the Shannon entropy (in bits) of the class labels in *y*.

    Ranges from 0.0 for a pure node up to log2(k) for k equally
    frequent classes.
    """
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    # np.unique only reports classes that actually occur, so every
    # probability is strictly positive -- no epsilon is needed, and pure
    # (0.0) / balanced (1.0) nodes get exact values instead of the
    # slightly-off results the old `+ 1e-10` hack produced.
    return -np.sum(probabilities * np.log2(probabilities))

# Example label sets with increasing class mixing.
y_pure = [0, 0, 0, 0, 0]  # pure node: a single class
y_mixed = [0, 0, 1, 1, 1]  # mixed node (2:3 split)
y_balanced = [0, 0, 1, 1]  # perfectly balanced node (2:2)

print("์—”ํŠธ๋กœํ”ผ ์˜ˆ์‹œ:")
print(f"  ์ˆœ์ˆ˜ ๋…ธ๋“œ: {entropy(y_pure):.4f}")  # 0 (no uncertainty)
print(f"  ํ˜ผํ•ฉ ๋…ธ๋“œ [2:3]: {entropy(y_mixed):.4f}")
print(f"  ๊ท ํ˜• ๋…ธ๋“œ [2:2]: {entropy(y_balanced):.4f}")  # 1 (maximum for 2 classes)

2.2 ์ง€๋‹ˆ ๋ถˆ์ˆœ๋„ (Gini Impurity)

def gini_impurity(y):
    """Return the Gini impurity of the class labels in *y*.

    Equals 1 - sum(p_k^2): the probability that two samples drawn from
    the node (with replacement) carry different labels. 0.0 for a pure
    node, 0.5 at most for two balanced classes.
    """
    _, class_counts = np.unique(y, return_counts=True)
    proportions = class_counts / len(y)
    return 1 - np.sum(np.square(proportions))

print("\n์ง€๋‹ˆ ๋ถˆ์ˆœ๋„ ์˜ˆ์‹œ:")
print(f"  ์ˆœ์ˆ˜ ๋…ธ๋“œ: {gini_impurity(y_pure):.4f}")  # 0 (pure)
print(f"  ํ˜ผํ•ฉ ๋…ธ๋“œ: {gini_impurity(y_mixed):.4f}")
print(f"  ๊ท ํ˜• ๋…ธ๋“œ: {gini_impurity(y_balanced):.4f}")  # 0.5 (maximum for 2 classes)

# Comparison note: entropy vs. Gini as split criteria.
"""
- Gini: ๊ณ„์‚ฐ์ด ๋น ๋ฆ„, ๊ธฐ๋ณธ๊ฐ’
- Entropy: ๋” ๊ท ํ˜• ์žกํžŒ ํŠธ๋ฆฌ ๊ฒฝํ–ฅ
- ์‹ค์ œ๋กœ ํฐ ์ฐจ์ด ์—†์Œ
"""

2.3 ์ •๋ณด ์ด๋“ (Information Gain)

def information_gain(parent, left_child, right_child, criterion='gini'):
    """Compute the impurity reduction achieved by a binary split.

    *parent* holds the labels before the split; *left_child* and
    *right_child* hold the labels of the two resulting partitions.
    *criterion* selects the impurity measure: 'gini' (default) uses
    gini_impurity, anything else uses entropy.
    """
    impurity = gini_impurity if criterion == 'gini' else entropy

    n_left = len(left_child)
    n_right = len(right_child)
    total = n_left + n_right

    # Impurity after the split: child impurities weighted by child size.
    children_impurity = (
        (n_left / total) * impurity(left_child)
        + (n_right / total) * impurity(right_child)
    )

    return impurity(parent) - children_impurity

# Example: comparing two candidate binary splits of the same parent node.
parent = [0, 0, 0, 1, 1, 1]

# Split A: perfect separation -- each child is pure.
left_a = [0, 0, 0]
right_a = [1, 1, 1]

# Split B: both children remain mixed, so the gain is small.
left_b = [0, 0, 1]
right_b = [0, 1, 1]

print("\n์ •๋ณด ์ด๋“ ๋น„๊ต:")
print(f"  ๋ถ„ํ•  A (์™„๋ฒฝ): {information_gain(parent, left_a, right_a):.4f}")
print(f"  ๋ถ„ํ•  B (ํ˜ผํ•ฉ): {information_gain(parent, left_b, right_b):.4f}")

3. CART ์•Œ๊ณ ๋ฆฌ์ฆ˜

3.1 ๋ถ„๋ฅ˜ ํŠธ๋ฆฌ (Classification)

from sklearn.tree import DecisionTreeClassifier

# Compare the available classification split criteria.
# 'log_loss' is equivalent to 'entropy' in scikit-learn >= 1.1.
criteria = ['gini', 'entropy', 'log_loss']

print("๋ถ„๋ฅ˜ ํŠธ๋ฆฌ - Criterion ๋น„๊ต:")
for criterion in criteria:
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"  {criterion}: ์ •ํ™•๋„ = {accuracy:.4f}, ๊นŠ์ด = {clf.get_depth()}")

3.2 ํšŒ๊ท€ ํŠธ๋ฆฌ (Regression)

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes regression dataset and split it 80/20.
diabetes = load_diabetes()
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Regression tree with the MSE ('squared_error') criterion.
reg = DecisionTreeRegressor(criterion='squared_error', random_state=42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

print("\nํšŒ๊ท€ ํŠธ๋ฆฌ ๊ฒฐ๊ณผ:")
print(f"  MSE: {mean_squared_error(y_test_r, y_pred_r):.4f}")
print(f"  Rยฒ: {r2_score(y_test_r, y_pred_r):.4f}")

# Compare the available regression split criteria.
criteria_reg = ['squared_error', 'friedman_mse', 'absolute_error']

print("\nํšŒ๊ท€ ํŠธ๋ฆฌ - Criterion ๋น„๊ต:")
for criterion in criteria_reg:
    reg = DecisionTreeRegressor(criterion=criterion, random_state=42)
    reg.fit(X_train_r, y_train_r)
    y_pred = reg.predict(X_test_r)
    mse = mean_squared_error(y_test_r, y_pred)
    print(f"  {criterion}: MSE = {mse:.4f}")

3.3 ๋ถ„ํ•  ํƒ์ƒ‰ ๊ณผ์ •

"""
CART ์•Œ๊ณ ๋ฆฌ์ฆ˜ ๋ถ„ํ•  ๊ณผ์ •:

1. ๋ชจ๋“  ํŠน์„ฑ์— ๋Œ€ํ•ด:
   - ๋ชจ๋“  ๊ฐ€๋Šฅํ•œ ๋ถ„ํ• ์  ๊ฒ€ํ† 
   - ๊ฐ ๋ถ„ํ• ์˜ ๋ถˆ์ˆœ๋„ ๊ฐ์†Œ๋Ÿ‰ ๊ณ„์‚ฐ

2. ์ตœ์  ๋ถ„ํ•  ์„ ํƒ:
   - ๊ฐ€์žฅ ํฐ ๋ถˆ์ˆœ๋„ ๊ฐ์†Œ๋ฅผ ์ฃผ๋Š” (ํŠน์„ฑ, ๋ถ„ํ• ์ ) ์„ ํƒ

3. ์žฌ๊ท€์  ๋ถ„ํ• :
   - ๊ฐ ์ž์‹ ๋…ธ๋“œ์— ๋Œ€ํ•ด 1-2 ๋ฐ˜๋ณต
   - ์ข…๋ฃŒ ์กฐ๊ฑด ๋งŒ์กฑ ์‹œ ์ค‘์ง€

์ข…๋ฃŒ ์กฐ๊ฑด:
- ์ตœ๋Œ€ ๊นŠ์ด ๋„๋‹ฌ
- ๋…ธ๋“œ ๋‚ด ์ƒ˜ํ”Œ ์ˆ˜๊ฐ€ ์ตœ์†Œ ๊ธฐ์ค€ ์ดํ•˜
- ์ˆœ์ˆ˜ ๋…ธ๋“œ ๋„๋‹ฌ (๋ถˆ์ˆœ๋„ = 0)
"""

# ๋ถ„ํ•  ๊ณผ์ • ์‹œ๋ฎฌ๋ ˆ์ด์…˜
def find_best_split(X, y, feature_idx):
    """Exhaustively search one feature for the threshold with maximal gain.

    Candidate thresholds are the midpoints between consecutive distinct
    values of column *feature_idx*, taken in sorted order. Returns
    (best_threshold, best_gain); the threshold is None when the feature
    is constant (no valid split exists).
    """
    values = X[:, feature_idx]
    order = np.argsort(values)

    best_threshold = None
    best_gain = -1

    # Walk adjacent pairs of the sort order (same pairs as indexing i-1, i).
    for prev_idx, curr_idx in zip(order[:-1], order[1:]):
        # Ties yield no usable threshold between the two samples.
        if values[prev_idx] == values[curr_idx]:
            continue

        threshold = (values[prev_idx] + values[curr_idx]) / 2
        left_mask = values <= threshold

        # A degenerate split (everything on one side) carries no information.
        n_left = np.sum(left_mask)
        if n_left == 0 or np.sum(~left_mask) == 0:
            continue

        gain = information_gain(y, y[left_mask], y[~left_mask])

        if gain > best_gain:
            best_gain = gain
            best_threshold = threshold

    return best_threshold, best_gain

# Report the best single-feature split over the full iris dataset.
print("\n์ตœ์  ๋ถ„ํ• ์  ํƒ์ƒ‰:")
for i, name in enumerate(iris.feature_names):
    threshold, gain = find_best_split(iris.data, iris.target, i)
    print(f"  {name}: threshold={threshold:.2f}, gain={gain:.4f}")

4. ๊ฐ€์ง€์น˜๊ธฐ (Pruning)

4.1 ์‚ฌ์ „ ๊ฐ€์ง€์น˜๊ธฐ (Pre-pruning)

# ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ํŠธ๋ฆฌ ์„ฑ์žฅ ์ œํ•œ
clf_pruned = DecisionTreeClassifier(
    max_depth=3,              # ์ตœ๋Œ€ ๊นŠ์ด
    min_samples_split=10,     # ๋ถ„ํ• ์— ํ•„์š”ํ•œ ์ตœ์†Œ ์ƒ˜ํ”Œ ์ˆ˜
    min_samples_leaf=5,       # ๋ฆฌํ”„ ๋…ธ๋“œ ์ตœ์†Œ ์ƒ˜ํ”Œ ์ˆ˜
    max_features='sqrt',      # ๋ถ„ํ•  ์‹œ ๊ณ ๋ คํ•  ์ตœ๋Œ€ ํŠน์„ฑ ์ˆ˜
    max_leaf_nodes=10,        # ์ตœ๋Œ€ ๋ฆฌํ”„ ๋…ธ๋“œ ์ˆ˜
    random_state=42
)
clf_pruned.fit(X_train, y_train)

print("์‚ฌ์ „ ๊ฐ€์ง€์น˜๊ธฐ ๊ฒฐ๊ณผ:")
print(f"  ๊นŠ์ด: {clf_pruned.get_depth()}")
print(f"  ๋ฆฌํ”„ ๋…ธ๋“œ ์ˆ˜: {clf_pruned.get_n_leaves()}")
print(f"  ์ •ํ™•๋„: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}")

4.2 ์‚ฌํ›„ ๊ฐ€์ง€์น˜๊ธฐ (Post-pruning) - Cost Complexity Pruning

# Post-pruning via cost-complexity pruning (CCP): compute the sequence of
# effective alphas along the pruning path of a fitted tree.
# NOTE(review): `clf` here is whichever classifier was fitted most recently
# above (the last criterion in the comparison loop) -- confirm intended.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

print("CCP Alpha ๊ฒฝ๋กœ:")
print(f"  Alpha ๊ฐ’ ์ˆ˜: {len(ccp_alphas)}")

# Fit one tree per effective alpha along the path.
clfs = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    clf_ccp.fit(X_train, y_train)
    clfs.append(clf_ccp)

# Track accuracy, leaf count, and depth as alpha (pruning strength) grows.
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
n_leaves = [clf.get_n_leaves() for clf in clfs]
depths = [clf.get_depth() for clf in clfs]

# Visualize the effect of alpha on the three quantities.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Alpha vs. accuracy (train and test).
axes[0].plot(ccp_alphas, train_scores, marker='o', label='Train', drawstyle='steps-post')
axes[0].plot(ccp_alphas, test_scores, marker='o', label='Test', drawstyle='steps-post')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Alpha vs Accuracy')
axes[0].legend()

# Alpha vs. number of leaves.
axes[1].plot(ccp_alphas, n_leaves, marker='o', drawstyle='steps-post')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Number of Leaves')
axes[1].set_title('Alpha vs Number of Leaves')

# Alpha vs. depth.
axes[2].plot(ccp_alphas, depths, marker='o', drawstyle='steps-post')
axes[2].set_xlabel('Alpha')
axes[2].set_ylabel('Depth')
axes[2].set_title('Alpha vs Depth')

plt.tight_layout()
plt.show()

# Select the best alpha by 5-fold cross-validation on the training set.
from sklearn.model_selection import cross_val_score

cv_scores = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    scores = cross_val_score(clf_ccp, X_train, y_train, cv=5)
    cv_scores.append(scores.mean())

best_idx = np.argmax(cv_scores)
best_alpha = ccp_alphas[best_idx]
print(f"\n์ตœ์  Alpha: {best_alpha:.6f}")
print(f"์ตœ์  CV ์ ์ˆ˜: {cv_scores[best_idx]:.4f}")

4.3 ๊ฐ€์ง€์น˜๊ธฐ ๋น„๊ต

# Compare the fully grown tree with the CCP-pruned tree side by side.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: unpruned (fully grown) tree.
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)

plot_tree(clf_full, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True, ax=axes[0], fontsize=8)
axes[0].set_title(f'Full Tree (Depth={clf_full.get_depth()}, Leaves={clf_full.get_n_leaves()})\n'
                  f'Accuracy: {accuracy_score(y_test, clf_full.predict(X_test)):.4f}')

# Right panel: tree pruned with the best alpha found above.
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)

plot_tree(clf_pruned, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True, ax=axes[1], fontsize=10)
axes[1].set_title(f'Pruned Tree (Depth={clf_pruned.get_depth()}, Leaves={clf_pruned.get_n_leaves()})\n'
                  f'Accuracy: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}')

plt.tight_layout()
plt.show()

5. ๊ฒฐ์ • ๊ฒฝ๊ณ„ ์‹œ๊ฐํ™”

from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np

# Generate a 2-feature synthetic classification dataset for plotting.
X_2d, y_2d = make_classification(
    n_samples=200, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1, random_state=42
)

# Compare decision boundaries for trees of increasing depth.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
depths = [1, 2, 3, 5, 10, None]

for ax, depth in zip(axes.flatten(), depths):
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_2d, y_2d)

    # Evaluate predictions on a 100x100 grid covering the data range.
    x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
    y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Shade the predicted regions and overlay the training points.
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, edgecolors='black', cmap='RdYlBu')

    # Accuracy shown here is on the training data itself (no holdout).
    depth_str = depth if depth else 'None'
    ax.set_title(f'Max Depth = {depth_str}\nAccuracy = {clf.score(X_2d, y_2d):.3f}')

plt.tight_layout()
plt.show()

6. ๊ฒฐ์ • ํŠธ๋ฆฌ์˜ ์žฅ๋‹จ์ 

6.1 ์žฅ์ ๊ณผ ๋‹จ์ 

"""
์žฅ์ :
1. ํ•ด์„ ์šฉ์ด: ์‹œ๊ฐํ™”ํ•˜์—ฌ ์˜์‚ฌ๊ฒฐ์ • ๊ณผ์ • ์ดํ•ด ๊ฐ€๋Šฅ
2. ์ „์ฒ˜๋ฆฌ ์ตœ์†Œ: ์Šค์ผ€์ผ๋ง, ์ •๊ทœํ™” ๋ถˆํ•„์š”
3. ๋น„์„ ํ˜• ๊ด€๊ณ„: ๋ณต์žกํ•œ ๋น„์„ ํ˜• ํŒจํ„ด ํ•™์Šต ๊ฐ€๋Šฅ
4. ๋‹ค์–‘ํ•œ ๋ฐ์ดํ„ฐ: ์ˆ˜์น˜ํ˜•, ๋ฒ”์ฃผํ˜• ๋ชจ๋‘ ์ฒ˜๋ฆฌ
5. ๋น ๋ฅธ ์˜ˆ์ธก: O(log n) ์‹œ๊ฐ„ ๋ณต์žก๋„

๋‹จ์ :
1. ๊ณผ์ ํ•ฉ ๊ฒฝํ–ฅ: ๊นŠ์€ ํŠธ๋ฆฌ๋Š” ์‰ฝ๊ฒŒ ๊ณผ์ ํ•ฉ
2. ๋ถˆ์•ˆ์ •์„ฑ: ์ž‘์€ ๋ฐ์ดํ„ฐ ๋ณ€ํ™”์— ๋ฏผ๊ฐ
3. ์ตœ์ ํ™” ํ•œ๊ณ„: ์ „์—ญ ์ตœ์ ํ•ด ๋ณด์žฅ ์•ˆ๋จ (Greedy)
4. ์™ธ์‚ฝ ๋ถˆ๊ฐ€: ํ•™์Šต ๋ฒ”์œ„ ๋ฐ– ์˜ˆ์ธก ์–ด๋ ค์›€
5. ํŽธํ–ฅ: ํด๋ž˜์Šค ๋ถˆ๊ท ํ˜•์— ๋ฏผ๊ฐ
"""

6.2 ๋ถˆ์•ˆ์ •์„ฑ ๋ฐ๋ชจ

# ๋ฐ์ดํ„ฐ ์•ฝ๊ฐ„ ๋ณ€๊ฒฝ ์‹œ ํŠธ๋ฆฌ ๋ณ€ํ™”
np.random.seed(42)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, ax in enumerate(axes):
    # ์•ฝ๊ฐ„ ๋‹ค๋ฅธ ๋žœ๋ค ์‹œ๋“œ๋กœ ๋ฐ์ดํ„ฐ ๋ถ„ํ• 
    X_tr, X_te, y_tr, y_te = train_test_split(
        iris.data[:, :2], iris.target, test_size=0.2, random_state=i
    )

    clf = DecisionTreeClassifier(max_depth=3, random_state=42)
    clf.fit(X_tr, y_tr)

    # ๊ฒฐ์ • ๊ฒฝ๊ณ„
    x_min, x_max = iris.data[:, 0].min() - 0.5, iris.data[:, 0].max() + 0.5
    y_min, y_max = iris.data[:, 1].min() - 0.5, iris.data[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_tr[:, 0], X_tr[:, 1], c=y_tr, edgecolors='black')
    ax.set_title(f'Random State = {i}')

plt.suptitle('๊ฒฐ์ • ํŠธ๋ฆฌ์˜ ๋ถˆ์•ˆ์ •์„ฑ: ๋ฐ์ดํ„ฐ ๋ถ„ํ• ์— ๋”ฐ๋ฅธ ๋ณ€ํ™”')
plt.tight_layout()
plt.show()

7. ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search over.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Exhaustive grid search with 5-fold CV, parallelized across all cores.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹ ๊ฒฐ๊ณผ:")
print(f"  ์ตœ์  ํŒŒ๋ผ๋ฏธํ„ฐ: {grid_search.best_params_}")
print(f"  ์ตœ์  CV ์ ์ˆ˜: {grid_search.best_score_:.4f}")
print(f"  ํ…Œ์ŠคํŠธ ์ ์ˆ˜: {grid_search.score(X_test, y_test):.4f}")

8. ํŠน์„ฑ ์ค‘์š”๋„

# Fit an unconstrained tree on the full iris dataset (no holdout).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(iris.data, iris.target)

# Impurity-based importances, sorted in descending order.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar chart of importances by feature.
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)),
           [iris.feature_names[i] for i in indices], rotation=45)
plt.ylabel('Feature Importance')
plt.title('Decision Tree Feature Importance')
plt.tight_layout()
plt.show()

print("\nํŠน์„ฑ ์ค‘์š”๋„ ์ˆœ์œ„:")
for i, idx in enumerate(indices):
    print(f"  {i+1}. {iris.feature_names[idx]}: {importances[idx]:.4f}")

์—ฐ์Šต ๋ฌธ์ œ

๋ฌธ์ œ 1: ๊ธฐ๋ณธ ๋ถ„๋ฅ˜

์œ ๋ฐฉ์•” ๋ฐ์ดํ„ฐ๋กœ ๊ฒฐ์ • ํŠธ๋ฆฌ๋ฅผ ํ•™์Šตํ•˜๊ณ  ํ‰๊ฐ€ํ•˜์„ธ์š”.

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Breast cancer dataset, 80/20 split.
# NOTE: this rebinds X_train/X_test/y_train/y_test for later exercises.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Solution: depth-limited tree to curb overfitting.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(f"์ •ํ™•๋„: {accuracy_score(y_test, y_pred):.4f}")
print("\n๋ถ„๋ฅ˜ ๋ฆฌํฌํŠธ:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))

๋ฌธ์ œ 2: ๊ฐ€์ง€์น˜๊ธฐ

CCP๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์ตœ์ ์˜ alpha๋ฅผ ์ฐพ๊ณ  ๊ฐ€์ง€์น˜๊ธฐํ•˜์„ธ์š”.

# Solution
from sklearn.model_selection import cross_val_score

# Compute the cost-complexity pruning path from a fully grown tree.
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)
path = clf_full.cost_complexity_pruning_path(X_train, y_train)

# Find the best alpha by 5-fold cross-validation on the training set.
best_alpha = 0
best_score = 0
for alpha in path.ccp_alphas[::5]:  # subsample every 5th alpha for speed
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    if scores.mean() > best_score:
        best_score = scores.mean()
        best_alpha = alpha

print(f"์ตœ์  Alpha: {best_alpha:.6f}")
print(f"์ตœ์  CV ์ ์ˆ˜: {best_score:.4f}")

# Refit with the selected alpha and evaluate on the test set.
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)
print(f"ํ…Œ์ŠคํŠธ ์ •ํ™•๋„: {clf_pruned.score(X_test, y_test):.4f}")

๋ฌธ์ œ 3: ํšŒ๊ท€ ํŠธ๋ฆฌ

๋‹น๋‡จ๋ณ‘ ๋ฐ์ดํ„ฐ๋กœ ํšŒ๊ท€ ํŠธ๋ฆฌ๋ฅผ ํ•™์Šตํ•˜์„ธ์š”.

from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Diabetes regression dataset, 80/20 split.
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Solution: shallow tree with a leaf-size floor to limit variance.
reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=10, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"Rยฒ: {r2_score(y_test, y_pred):.4f}")

์š”์•ฝ

๊ฐœ๋… ์„ค๋ช… ์šฉ๋„
์—”ํŠธ๋กœํ”ผ ์ •๋ณด์˜ ๋ถˆํ™•์‹ค์„ฑ ์ธก์ • ๋ถ„ํ•  ๊ธฐ์ค€ (criterion='entropy')
์ง€๋‹ˆ ๋ถˆ์ˆœ๋„ ์ž˜๋ชป ๋ถ„๋ฅ˜๋  ํ™•๋ฅ  ๋ถ„ํ•  ๊ธฐ์ค€ (criterion='gini')
์ •๋ณด ์ด๋“ ๋ถ„ํ•  ํ›„ ๋ถˆ์ˆœ๋„ ๊ฐ์†Œ๋Ÿ‰ ์ตœ์  ๋ถ„ํ•  ์„ ํƒ
max_depth ํŠธ๋ฆฌ ์ตœ๋Œ€ ๊นŠ์ด ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
min_samples_split ๋ถ„ํ• ์— ํ•„์š”ํ•œ ์ตœ์†Œ ์ƒ˜ํ”Œ ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
min_samples_leaf ๋ฆฌํ”„ ๋…ธ๋“œ ์ตœ์†Œ ์ƒ˜ํ”Œ ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
ccp_alpha ๋น„์šฉ-๋ณต์žก๋„ ๊ฐ€์ง€์น˜๊ธฐ ์‚ฌํ›„ ๊ฐ€์ง€์น˜๊ธฐ
feature_importances_ ํŠน์„ฑ ์ค‘์š”๋„ ํŠน์„ฑ ์„ ํƒ

๊ฒฐ์ • ํŠธ๋ฆฌ ์‚ฌ์šฉ ์‹œ ์ฒดํฌ๋ฆฌ์ŠคํŠธ

  1. ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€๋ฅผ ์œ„ํ•ด ๊ฐ€์ง€์น˜๊ธฐ ์ ์šฉ
  2. ์ค‘์š” ํŠน์„ฑ ํ™•์ธ์œผ๋กœ ํ•ด์„ ๊ฐ€๋Šฅ์„ฑ ํ™œ์šฉ
  3. ๋ถˆ์•ˆ์ •์„ฑ ํ•ด๊ฒฐ์„ ์œ„ํ•ด ์•™์ƒ๋ธ”(Random Forest) ๊ณ ๋ ค
  4. ์ˆ˜์น˜ํ˜• ํŠน์„ฑ์€ ์Šค์ผ€์ผ๋ง ๋ถˆํ•„์š”
  5. ๋ฒ”์ฃผํ˜• ํŠน์„ฑ์€ ์ธ์ฝ”๋”ฉ ํ•„์š” (sklearn ๊ธฐ์ค€)
to navigate between lessons