๊ฒฐ์ ํธ๋ฆฌ (Decision Tree)
๊ฒฐ์ ํธ๋ฆฌ (Decision Tree)¶
๊ฐ์¶
๊ฒฐ์ ํธ๋ฆฌ๋ ๋ฐ์ดํฐ๋ฅผ ํน์ฑ(feature)์ ๋ฐ๋ผ ๋ถํ ํ์ฌ ํธ๋ฆฌ ๊ตฌ์กฐ๋ก ์์ฌ๊ฒฐ์ ์ ์ํํ๋ ์๊ณ ๋ฆฌ์ฆ์ ๋๋ค. ์ง๊ด์ ์ด๊ณ ํด์์ด ์ฌ์ ์ค๋ฌด์์ ๋ง์ด ์ฌ์ฉ๋ฉ๋๋ค.
1. ๊ฒฐ์ ํธ๋ฆฌ์ ๊ธฐ๋ณธ ๊ฐ๋ ¶
1.1 ํธ๋ฆฌ ๊ตฌ์กฐ¶
"""
Decision tree components:
1. Root node: the first split point
2. Internal node: an intermediate split point
3. Leaf node: a terminal node holding the final prediction
4. Split: partitioning the data according to a feature
5. Depth: the distance from the root down to a node

Example: Titanic survival prediction
        [Sex]
       /     \
    male    female
     |         |
   [Age]   survived
   /   \
 <10   >=10
  |      |
survived died
"""
1.2 ๊ธฐ๋ณธ ์ฌ์ฉ๋ฒ¶
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Load the iris dataset and hold out 20% for testing
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42
)
# Build and fit an unconstrained decision tree (grows until leaves are pure)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
# Predict on the held-out test set
y_pred = clf.predict(X_test)
print(f"์ ํ๋: {accuracy_score(y_test, y_pred):.4f}")
# Dump the learned tree as indented text rules
print("\nํธ๋ฆฌ ๊ตฌ์กฐ:")
print(export_text(clf, feature_names=iris.feature_names))
1.3 ํธ๋ฆฌ ์๊ฐํ¶
# Visualize the fitted tree and report its feature importances.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,     # color nodes by majority class
    rounded=True,
    fontsize=10
)
plt.title('Decision Tree - Iris Classification')
plt.tight_layout()
plt.show()

# Impurity-based feature importances (they sum to 1)
print("\nํน์ฑ ์ค์๋:")
for name, importance in zip(iris.feature_names, clf.feature_importances_):
    print(f" {name}: {importance:.4f}")
2. ๋ถํ ๊ธฐ์ค (Split Criteria)¶
2.1 ์ํธ๋กํผ (Entropy)¶
import numpy as np
def entropy(y):
    """Return the Shannon entropy (in bits) of the label collection *y*.

    Entropy is 0 for a pure node and log2(k) for k equally likely
    classes. The small epsilon guards log2 against a zero argument.
    (Reconstructed: body indentation was lost during extraction.)
    """
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities + 1e-10))
# Entropy examples on three kinds of label sets.
# (Reconstructed: the original comments and f-string literals were
# mojibake-broken across lines, producing syntax errors.)
y_pure = [0, 0, 0, 0, 0]      # pure node (single class)
y_mixed = [0, 0, 1, 1, 1]     # mixed node
y_balanced = [0, 0, 1, 1]     # perfectly balanced node
print("์ํธ๋กํผ ์์:")
print(f"  pure node: {entropy(y_pure):.4f}")            # 0
print(f"  mixed node [2:3]: {entropy(y_mixed):.4f}")
print(f"  balanced node [2:2]: {entropy(y_balanced):.4f}")  # 1 (maximum)
2.2 ์ง๋ ๋ถ์๋ (Gini Impurity)¶
def gini_impurity(y):
    """Return the Gini impurity of the label collection *y*.

    Gini = 1 - sum(p_k^2): 0 for a pure node, up to 1 - 1/k for k
    equally likely classes (0.5 for a balanced binary node).
    (Reconstructed: body indentation was lost during extraction.)
    """
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)
# Gini impurity on the same example nodes as the entropy demo.
# (Reconstructed: the original f-string literals were mojibake-broken
# across lines, producing syntax errors.)
print("\n์ง๋ ๋ถ์๋ ์์:")
print(f"  pure node: {gini_impurity(y_pure):.4f}")          # 0
print(f"  mixed node: {gini_impurity(y_mixed):.4f}")
print(f"  balanced node: {gini_impurity(y_balanced):.4f}")  # 0.5 (maximum)

# Comparison: entropy vs Gini
# - Gini: cheaper to compute; scikit-learn's default criterion
# - Entropy: tends toward slightly more balanced trees
# - In practice the two rarely make a meaningful difference
2.3 ์ ๋ณด ์ด๋ (Information Gain)¶
def information_gain(parent, left_child, right_child, criterion='gini'):
    """Return the impurity reduction from splitting *parent* into two children.

    Parameters
    ----------
    parent, left_child, right_child : label collections before/after the split
    criterion : 'gini' selects Gini impurity; any other value selects entropy

    Each child's impurity is weighted by its share of the samples.
    (Reconstructed: body indentation was lost during extraction.)
    """
    if criterion == 'gini':
        impurity_func = gini_impurity
    else:
        impurity_func = entropy
    # Weighted mean impurity of the two children
    n = len(left_child) + len(right_child)
    n_left, n_right = len(left_child), len(right_child)
    weighted_impurity = (n_left / n) * impurity_func(left_child) + \
        (n_right / n) * impurity_func(right_child)
    return impurity_func(parent) - weighted_impurity
# Example: comparing two candidate splits of the same parent node
parent = [0, 0, 0, 1, 1, 1]
# Split A: perfect split — each child is pure
left_a = [0, 0, 0]
right_a = [1, 1, 1]
# Split B: poor split — both children stay mixed
left_b = [0, 0, 1]
right_b = [0, 1, 1]
print("\n์ ๋ณด ์ด๋ ๋น๊ต:")
print(f" ๋ถํ A (์๋ฒฝ): {information_gain(parent, left_a, right_a):.4f}")
print(f" ๋ถํ B (ํผํฉ): {information_gain(parent, left_b, right_b):.4f}")
3. CART ์๊ณ ๋ฆฌ์ฆ¶
3.1 ๋ถ๋ฅ ํธ๋ฆฌ (Classification)¶
from sklearn.tree import DecisionTreeClassifier

# Compare the available split criteria for classification trees.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
criteria = ['gini', 'entropy', 'log_loss']
print("๋ถ๋ฅ ํธ๋ฆฌ - Criterion ๋น๊ต:")
for criterion in criteria:
    clf = DecisionTreeClassifier(criterion=criterion, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f" {criterion}: ์ ํ๋ = {accuracy:.4f}, ๊น์ด = {clf.get_depth()}")
3.2 ํ๊ท ํธ๋ฆฌ (Regression)¶
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes regression dataset.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
diabetes = load_diabetes()
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Regression tree using the MSE ('squared_error') criterion
reg = DecisionTreeRegressor(criterion='squared_error', random_state=42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)
print("\nํ๊ท ํธ๋ฆฌ ๊ฒฐ๊ณผ:")
print(f" MSE: {mean_squared_error(y_test_r, y_pred_r):.4f}")
print(f" Rยฒ: {r2_score(y_test_r, y_pred_r):.4f}")

# Compare the regression split criteria
criteria_reg = ['squared_error', 'friedman_mse', 'absolute_error']
print("\nํ๊ท ํธ๋ฆฌ - Criterion ๋น๊ต:")
for criterion in criteria_reg:
    reg = DecisionTreeRegressor(criterion=criterion, random_state=42)
    reg.fit(X_train_r, y_train_r)
    y_pred = reg.predict(X_test_r)
    mse = mean_squared_error(y_test_r, y_pred)
    print(f" {criterion}: MSE = {mse:.4f}")
3.3 ๋ถํ ํ์ ๊ณผ์ ¶
"""
CART splitting procedure:
1. For every feature:
   - Enumerate all candidate split points
   - Compute the impurity decrease of each split
2. Choose the best split:
   - Pick the (feature, threshold) pair with the largest impurity decrease
3. Split recursively:
   - Repeat steps 1-2 for each child node
   - Stop when a termination condition is met

Termination conditions:
- Maximum depth reached
- Node sample count at or below the minimum threshold
- Pure node reached (impurity = 0)
"""
# Split-search simulation
def find_best_split(X, y, feature_idx):
    """Find the best threshold for a single feature by information gain.

    Scans the midpoints between consecutive distinct sorted values and
    returns (best_threshold, best_gain); best_threshold is None when no
    valid split exists.
    (Reconstructed: body indentation was lost during extraction.)
    """
    feature = X[:, feature_idx]
    sorted_indices = np.argsort(feature)
    best_gain = -1
    best_threshold = None
    for i in range(1, len(feature)):
        # Equal adjacent values offer no boundary to split on
        if feature[sorted_indices[i-1]] == feature[sorted_indices[i]]:
            continue
        # Candidate threshold: midpoint between adjacent sorted values
        threshold = (feature[sorted_indices[i-1]] + feature[sorted_indices[i]]) / 2
        left_mask = feature <= threshold
        # Skip degenerate splits that leave one side empty
        if np.sum(left_mask) == 0 or np.sum(~left_mask) == 0:
            continue
        gain = information_gain(y, y[left_mask], y[~left_mask])
        if gain > best_gain:
            best_gain = gain
            best_threshold = threshold
    return best_threshold, best_gain
# Demo: best split per iris feature.
# (Reconstructed: the original comment was mojibake-broken across lines,
# leaving a stray token that made the block invalid.)
print("\n์ต์ ๋ถํ ์ ํ์:")
for i, name in enumerate(iris.feature_names):
    threshold, gain = find_best_split(iris.data, iris.target, i)
    print(f" {name}: threshold={threshold:.2f}, gain={gain:.4f}")
4. ๊ฐ์ง์น๊ธฐ (Pruning)¶
4.1 ์ฌ์ ๊ฐ์ง์น๊ธฐ (Pre-pruning)¶
# Pre-pruning: constrain tree growth through hyperparameters.
# (Reconstructed: the original inline comments were mojibake-broken across
# lines inside the call arguments, and one f-string literal was split,
# producing syntax errors.)
clf_pruned = DecisionTreeClassifier(
    max_depth=3,            # maximum tree depth
    min_samples_split=10,   # minimum samples required to split a node
    min_samples_leaf=5,     # minimum samples required at a leaf
    max_features='sqrt',    # max features considered per split
    max_leaf_nodes=10,      # maximum number of leaf nodes
    random_state=42
)
clf_pruned.fit(X_train, y_train)
print("์ฌ์ ๊ฐ์ง์น๊ธฐ ๊ฒฐ๊ณผ:")
print(f" ๊น์ด: {clf_pruned.get_depth()}")
print(f" Number of leaves: {clf_pruned.get_n_leaves()}")  # literal reconstructed
print(f" ์ ํ๋: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}")
4.2 ์ฌํ ๊ฐ์ง์น๊ธฐ (Post-pruning) - Cost Complexity Pruning¶
# CCP (Cost Complexity Pruning): compute the effective-alpha path.
# (Reconstructed: loop indentation and one comment were mangled by extraction.)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
print("CCP Alpha ๊ฒฝ๋ก:")
print(f" Alpha ๊ฐ ์: {len(ccp_alphas)}")

# Fit one tree per alpha value along the path
clfs = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    clf_ccp.fit(X_train, y_train)
    clfs.append(clf_ccp)

# Track accuracy, leaf count, and depth as alpha grows
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
n_leaves = [clf.get_n_leaves() for clf in clfs]
depths = [clf.get_depth() for clf in clfs]
# Visualization: effect of alpha on accuracy, leaf count, and depth.
# (Reconstructed: one original comment was mojibake-broken across lines,
# leaving stray tokens that made the block invalid.)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Alpha vs accuracy
axes[0].plot(ccp_alphas, train_scores, marker='o', label='Train', drawstyle='steps-post')
axes[0].plot(ccp_alphas, test_scores, marker='o', label='Test', drawstyle='steps-post')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Alpha vs Accuracy')
axes[0].legend()

# Alpha vs number of leaves
axes[1].plot(ccp_alphas, n_leaves, marker='o', drawstyle='steps-post')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Number of Leaves')
axes[1].set_title('Alpha vs Number of Leaves')

# Alpha vs depth
axes[2].plot(ccp_alphas, depths, marker='o', drawstyle='steps-post')
axes[2].set_xlabel('Alpha')
axes[2].set_ylabel('Depth')
axes[2].set_title('Alpha vs Depth')

plt.tight_layout()
plt.show()
# Select the best alpha by 5-fold cross-validation.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
from sklearn.model_selection import cross_val_score

cv_scores = []
for ccp_alpha in ccp_alphas:
    clf_ccp = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    scores = cross_val_score(clf_ccp, X_train, y_train, cv=5)
    cv_scores.append(scores.mean())

best_idx = np.argmax(cv_scores)
best_alpha = ccp_alphas[best_idx]
print(f"\n์ต์ Alpha: {best_alpha:.6f}")
print(f"์ต์ CV ์ ์: {cv_scores[best_idx]:.4f}")
4.3 ๊ฐ์ง์น๊ธฐ ๋น๊ต¶
# Side-by-side comparison of the tree before and after pruning
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
# Before pruning: fully grown (unconstrained) tree
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)
plot_tree(clf_full, feature_names=iris.feature_names,
class_names=iris.target_names, filled=True, ax=axes[0], fontsize=8)
axes[0].set_title(f'Full Tree (Depth={clf_full.get_depth()}, Leaves={clf_full.get_n_leaves()})\n'
f'Accuracy: {accuracy_score(y_test, clf_full.predict(X_test)):.4f}')
# After pruning: refit with the CV-selected ccp_alpha
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)
plot_tree(clf_pruned, feature_names=iris.feature_names,
class_names=iris.target_names, filled=True, ax=axes[1], fontsize=10)
axes[1].set_title(f'Pruned Tree (Depth={clf_pruned.get_depth()}, Leaves={clf_pruned.get_n_leaves()})\n'
f'Accuracy: {accuracy_score(y_test, clf_pruned.predict(X_test)):.4f}')
plt.tight_layout()
plt.show()
5. ๊ฒฐ์ ๊ฒฝ๊ณ ์๊ฐํ¶
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np

# Generate a 2-D toy classification dataset.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
X_2d, y_2d = make_classification(
    n_samples=200, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1, random_state=42
)

# Compare decision boundaries across several tree depths
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
depths = [1, 2, 3, 5, 10, None]
for ax, depth in zip(axes.flatten(), depths):
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_2d, y_2d)
    # Evaluate the classifier on a dense grid to draw the boundary
    x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
    y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, edgecolors='black', cmap='RdYlBu')
    depth_str = depth if depth else 'None'
    # NOTE: accuracy here is on the training data — deeper trees overfit
    ax.set_title(f'Max Depth = {depth_str}\nAccuracy = {clf.score(X_2d, y_2d):.3f}')
plt.tight_layout()
plt.show()
6. ๊ฒฐ์ ํธ๋ฆฌ์ ์ฅ๋จ์ ¶
6.1 ์ฅ์ ๊ณผ ๋จ์ ¶
"""
Strengths:
1. Easy to interpret: the decision process can be visualized and followed
2. Minimal preprocessing: no scaling or normalization required
3. Non-linear relationships: can learn complex non-linear patterns
4. Mixed data: handles both numerical and categorical features
5. Fast prediction: O(log n) time complexity

Weaknesses:
1. Prone to overfitting: deep trees overfit easily
2. Instability: sensitive to small changes in the data
3. Greedy optimization: no guarantee of a globally optimal tree
4. Poor extrapolation: cannot predict outside the training range
5. Bias: sensitive to class imbalance
"""
6.2 ๋ถ์์ ์ฑ ๋ฐ๋ชจ¶
# Instability demo: small changes in the training split reshape the tree.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
np.random.seed(42)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for i, ax in enumerate(axes):
    # Re-split the data with a different random seed in each panel
    X_tr, X_te, y_tr, y_te = train_test_split(
        iris.data[:, :2], iris.target, test_size=0.2, random_state=i
    )
    clf = DecisionTreeClassifier(max_depth=3, random_state=42)
    clf.fit(X_tr, y_tr)
    # Decision boundary over the full feature range
    x_min, x_max = iris.data[:, 0].min() - 0.5, iris.data[:, 0].max() + 0.5
    y_min, y_max = iris.data[:, 1].min() - 0.5, iris.data[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_tr[:, 0], X_tr[:, 1], c=y_tr, edgecolors='black')
    ax.set_title(f'Random State = {i}')
plt.suptitle('๊ฒฐ์ ํธ๋ฆฌ์ ๋ถ์์ ์ฑ: ๋ฐ์ดํฐ ๋ถํ ์ ๋ฐ๋ฅธ ๋ณํ')
plt.tight_layout()
plt.show()
7. ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋¶
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search over.
# (Reconstructed: the final f-string literal was mojibake-broken across
# two lines, producing a syntax error.)
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Exhaustive grid search with 5-fold CV, parallelized across all cores
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ ๊ฒฐ๊ณผ:")
print(f" ์ต์ ํ๋ผ๋ฏธํฐ: {grid_search.best_params_}")
print(f" ์ต์ CV ์ ์: {grid_search.best_score_:.4f}")
print(f" Test score: {grid_search.score(X_test, y_test):.4f}")  # literal reconstructed
8. ํน์ฑ ์ค์๋¶
# Fit on the full dataset to inspect feature importances.
# (Reconstructed: the for-loop body indentation was lost during extraction.)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(iris.data, iris.target)

# Impurity-based feature importances, indices sorted descending
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar chart of the ranked importances
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)),
           [iris.feature_names[i] for i in indices], rotation=45)
plt.ylabel('Feature Importance')
plt.title('Decision Tree Feature Importance')
plt.tight_layout()
plt.show()

print("\nํน์ฑ ์ค์๋ ์์:")
for i, idx in enumerate(indices):
    print(f" {i+1}. {iris.feature_names[idx]}: {importances[idx]:.4f}")
์ฐ์ต ๋ฌธ์ ¶
๋ฌธ์ 1: ๊ธฐ๋ณธ ๋ถ๋ฅ¶
์ ๋ฐฉ์ ๋ฐ์ดํฐ๋ก ๊ฒฐ์ ํธ๋ฆฌ๋ฅผ ํ์ตํ๊ณ ํ๊ฐํ์ธ์.
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, test_size=0.2, random_state=42
)
# Solution: depth-limited tree to curb overfitting on the 30-feature dataset
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"์ ํ๋: {accuracy_score(y_test, y_pred):.4f}")
print("\n๋ถ๋ฅ ๋ฆฌํฌํธ:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
๋ฌธ์ 2: ๊ฐ์ง์น๊ธฐ¶
CCP๋ฅผ ์ฌ์ฉํ์ฌ ์ต์ ์ alpha๋ฅผ ์ฐพ๊ณ ๊ฐ์ง์น๊ธฐํ์ธ์.
# Solution.
# (Reconstructed: loop indentation was lost and the final f-string literal
# was mojibake-broken across two lines, producing a syntax error.)
from sklearn.model_selection import cross_val_score

# Compute the cost-complexity pruning path on a fully grown tree
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)
path = clf_full.cost_complexity_pruning_path(X_train, y_train)

# Pick the best alpha by cross-validation
best_alpha = 0
best_score = 0
for alpha in path.ccp_alphas[::5]:  # subsample alphas for efficiency
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    if scores.mean() > best_score:
        best_score = scores.mean()
        best_alpha = alpha

print(f"์ต์ Alpha: {best_alpha:.6f}")
print(f"์ต์ CV ์ ์: {best_score:.4f}")

# Refit with the selected alpha and evaluate on the test set
clf_pruned = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
clf_pruned.fit(X_train, y_train)
print(f"Test accuracy: {clf_pruned.score(X_test, y_test):.4f}")  # literal reconstructed
๋ฌธ์ 3: ํ๊ท ํธ๋ฆฌ¶
๋น๋จ๋ณ ๋ฐ์ดํฐ๋ก ํ๊ท ํธ๋ฆฌ๋ฅผ ํ์ตํ์ธ์.
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, test_size=0.2, random_state=42
)
# Solution: shallow tree with a leaf-size floor to limit variance
reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=10, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"Rยฒ: {r2_score(y_test, y_pred):.4f}")
์์ฝ¶
| ๊ฐ๋ | ์ค๋ช | ์ฉ๋ |
|---|---|---|
| ์ํธ๋กํผ | ์ ๋ณด์ ๋ถํ์ค์ฑ ์ธก์ | ๋ถํ ๊ธฐ์ค (criterion='entropy') |
| ์ง๋ ๋ถ์๋ | ์๋ชป ๋ถ๋ฅ๋ ํ๋ฅ | ๋ถํ ๊ธฐ์ค (criterion='gini') |
| ์ ๋ณด ์ด๋ | ๋ถํ ํ ๋ถ์๋ ๊ฐ์๋ | ์ต์ ๋ถํ ์ ํ |
| max_depth | ํธ๋ฆฌ ์ต๋ ๊น์ด | ๊ณผ์ ํฉ ๋ฐฉ์ง |
| min_samples_split | ๋ถํ ์ ํ์ํ ์ต์ ์ํ | ๊ณผ์ ํฉ ๋ฐฉ์ง |
| min_samples_leaf | ๋ฆฌํ ๋ ธ๋ ์ต์ ์ํ | ๊ณผ์ ํฉ ๋ฐฉ์ง |
| ccp_alpha | ๋น์ฉ-๋ณต์ก๋ ๊ฐ์ง์น๊ธฐ | ์ฌํ ๊ฐ์ง์น๊ธฐ |
| feature_importances_ | ํน์ฑ ์ค์๋ | ํน์ฑ ์ ํ |
๊ฒฐ์ ํธ๋ฆฌ ์ฌ์ฉ ์ ์ฒดํฌ๋ฆฌ์คํธ¶
- ๊ณผ์ ํฉ ๋ฐฉ์ง๋ฅผ ์ํด ๊ฐ์ง์น๊ธฐ ์ ์ฉ
- ์ค์ ํน์ฑ ํ์ธ์ผ๋ก ํด์ ๊ฐ๋ฅ์ฑ ํ์ฉ
- ๋ถ์์ ์ฑ ํด๊ฒฐ์ ์ํด ์์๋ธ(Random Forest) ๊ณ ๋ ค
- ์์นํ ํน์ฑ์ ์ค์ผ์ผ๋ง ๋ถํ์
- ๋ฒ์ฃผํ ํน์ฑ์ ์ธ์ฝ๋ฉ ํ์ (sklearn ๊ธฐ์ค)