1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# 07. 랜덤 포레스트 (Random Forest)\n",
8 "\n",
9 "## 학습 목표\n",
10 "- 앙상블 학습과 배깅 이해\n",
11 "- 랜덤 포레스트 작동 원리\n",
12 "- 하이퍼파라미터 튜닝"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": null,
18 "metadata": {},
19 "outputs": [],
20 "source": [
21 "import numpy as np\n",
22 "import pandas as pd\n",
23 "import matplotlib.pyplot as plt\n",
24 "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
25 "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
26 "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
27 "from sklearn.datasets import load_iris, load_wine, fetch_california_housing\n",
28 "import seaborn as sns\n",
29 "\n",
30 "plt.rcParams['font.family'] = 'DejaVu Sans'"
31 ]
32 },
33 {
34 "cell_type": "markdown",
35 "metadata": {},
36 "source": [
37 "## 1. 랜덤 포레스트 분류"
38 ]
39 },
40 {
41 "cell_type": "code",
42 "execution_count": null,
43 "metadata": {},
44 "outputs": [],
45 "source": [
46 "# Load the Wine dataset and hold out 30% for testing\n",
47 "wine = load_wine()\n",
48 "X, y = wine.data, wine.target\n",
49 "\n",
50 "print(f\"Features: {wine.feature_names}\")\n",
51 "print(f\"Classes: {wine.target_names}\")\n",
52 "print(f\"Shape: {X.shape}\")\n",
53 "\n",
54 "# stratify=y keeps the class proportions equal in train and test splits,\n",
55 "# which matters for a small multi-class dataset like this one\n",
56 "X_train, X_test, y_train, y_test = train_test_split(\n",
57 "    X, y, test_size=0.3, random_state=42, stratify=y\n",
58 ")"
59 ]
58 },
59 {
60 "cell_type": "code",
61 "execution_count": null,
62 "metadata": {},
63 "outputs": [],
64 "source": [
65 "# Fit a bagged ensemble of 100 depth-capped trees on the wine data\n",
66 "rf_params = {\n",
67 "    'n_estimators': 100,\n",
68 "    'max_depth': 5,\n",
69 "    'random_state': 42,\n",
70 "    'n_jobs': -1,\n",
71 "}\n",
72 "rf_clf = RandomForestClassifier(**rf_params)\n",
73 "rf_clf.fit(X_train, y_train)\n",
74 "\n",
75 "# Evaluate on the held-out test set\n",
76 "y_pred = rf_clf.predict(X_test)\n",
77 "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
78 "print(f\"\\nClassification Report:\")\n",
79 "print(classification_report(y_test, y_pred, target_names=wine.target_names))"
80 ]
79 },
80 {
81 "cell_type": "markdown",
82 "metadata": {},
83 "source": [
84 "## 2. 트리 개수에 따른 성능"
85 ]
86 },
87 {
88 "cell_type": "code",
89 "execution_count": null,
90 "metadata": {},
91 "outputs": [],
92 "source": [
93 "# Track train / test / out-of-bag accuracy as the ensemble grows\n",
94 "n_trees = [1, 5, 10, 20, 50, 100, 200, 500]\n",
95 "train_scores, test_scores, oob_scores = [], [], []\n",
96 "\n",
97 "for n in n_trees:\n",
98 "    rf = RandomForestClassifier(n_estimators=n, random_state=42, oob_score=True)\n",
99 "    rf.fit(X_train, y_train)\n",
100 "    train_scores.append(rf.score(X_train, y_train))\n",
101 "    test_scores.append(rf.score(X_test, y_test))\n",
102 "    oob_scores.append(rf.oob_score_)  # accuracy on samples left out of the bootstrap\n",
103 "\n",
104 "fig, ax = plt.subplots(figsize=(10, 6))\n",
105 "ax.plot(n_trees, train_scores, 'b-o', label='Train Score')\n",
106 "ax.plot(n_trees, test_scores, 'r-o', label='Test Score')\n",
107 "ax.plot(n_trees, oob_scores, 'g-o', label='OOB Score')\n",
108 "ax.set_xlabel('Number of Trees')\n",
109 "ax.set_ylabel('Accuracy')\n",
110 "ax.set_title('Random Forest: Performance vs Number of Trees')\n",
111 "ax.legend()\n",
112 "ax.grid(True, alpha=0.3)\n",
113 "ax.set_xscale('log')\n",
114 "plt.show()"
115 ]
118 },
119 {
120 "cell_type": "markdown",
121 "metadata": {},
122 "source": [
123 "## 3. 특성 중요도"
124 ]
125 },
126 {
127 "cell_type": "code",
128 "execution_count": null,
129 "metadata": {},
130 "outputs": [],
131 "source": [
132 "# Impurity-based feature importances of the fitted classifier, smallest first\n",
133 "importance = (\n",
134 "    pd.Series(rf_clf.feature_importances_, index=wine.feature_names)\n",
135 "    .sort_values(ascending=True)\n",
136 ")\n",
137 "\n",
138 "plt.figure(figsize=(12, 8))\n",
139 "plt.barh(importance.index, importance.values)\n",
140 "plt.xlabel('Feature Importance')\n",
141 "plt.title('Random Forest Feature Importance - Wine Dataset')\n",
142 "plt.grid(True, alpha=0.3)\n",
143 "plt.tight_layout()\n",
144 "plt.show()"
145 ]
146 },
147 {
148 "cell_type": "markdown",
149 "metadata": {},
150 "source": [
151 "## 4. 하이퍼파라미터 튜닝"
152 ]
153 },
154 {
155 "cell_type": "code",
156 "execution_count": null,
157 "metadata": {},
158 "outputs": [],
159 "source": [
160 "param_grid = {\n",
161 "    'n_estimators': [50, 100, 200],\n",
162 "    'max_depth': [3, 5, 7, None],\n",
163 "    'min_samples_split': [2, 5, 10],\n",
164 "    'max_features': ['sqrt', 'log2', None]\n",
165 "}\n",
166 "\n",
167 "# Parallelize at the search level only: setting n_jobs=-1 on both the inner\n",
168 "# estimator and GridSearchCV spawns nested worker pools and oversubscribes\n",
169 "# CPU cores. Results are identical; only scheduling changes.\n",
170 "grid_search = GridSearchCV(\n",
171 "    RandomForestClassifier(random_state=42),\n",
172 "    param_grid,\n",
173 "    cv=5,\n",
174 "    scoring='accuracy',\n",
175 "    n_jobs=-1\n",
176 ")\n",
177 "\n",
178 "grid_search.fit(X_train, y_train)\n",
179 "\n",
180 "print(f\"Best Parameters: {grid_search.best_params_}\")\n",
181 "print(f\"Best CV Score: {grid_search.best_score_:.4f}\")\n",
182 "print(f\"Test Score: {grid_search.score(X_test, y_test):.4f}\")"
183 ]
181 },
182 {
183 "cell_type": "markdown",
184 "metadata": {},
185 "source": [
186 "## 5. 결정 트리 vs 랜덤 포레스트 비교"
187 ]
188 },
189 {
190 "cell_type": "code",
191 "execution_count": null,
192 "metadata": {},
193 "outputs": [],
194 "source": [
195 "from sklearn.tree import DecisionTreeClassifier\n",
196 "\n",
197 "# Same depth cap for both models so the comparison is like-for-like\n",
198 "dt = DecisionTreeClassifier(max_depth=5, random_state=42)\n",
199 "rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)\n",
200 "\n",
201 "# 10-fold cross-validated accuracy on the full dataset\n",
202 "dt_scores = cross_val_score(dt, X, y, cv=10)\n",
203 "rf_scores = cross_val_score(rf, X, y, cv=10)\n",
204 "\n",
205 "for name, scores in [('Decision Tree', dt_scores), ('Random Forest', rf_scores)]:\n",
206 "    print(f\"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})\")\n",
207 "\n",
208 "# Box plot of the per-fold scores.\n",
209 "# NOTE(review): boxplot's `labels=` kwarg was renamed `tick_labels=` in\n",
210 "# matplotlib 3.9 — kept as-is here; confirm the target matplotlib version.\n",
211 "fig, ax = plt.subplots(figsize=(8, 6))\n",
212 "ax.boxplot([dt_scores, rf_scores], labels=['Decision Tree', 'Random Forest'])\n",
213 "ax.set_ylabel('Accuracy')\n",
214 "ax.set_title('Model Comparison: Decision Tree vs Random Forest')\n",
215 "ax.grid(True, alpha=0.3)\n",
216 "plt.show()"
217 ]
215 },
216 {
217 "cell_type": "markdown",
218 "metadata": {},
219 "source": [
220 "## 6. 랜덤 포레스트 회귀"
221 ]
222 },
223 {
224 "cell_type": "code",
225 "execution_count": null,
226 "metadata": {},
227 "outputs": [],
228 "source": [
229 "# California Housing regression data\n",
230 "housing = fetch_california_housing()\n",
231 "X_h, y_h = housing.data, housing.target\n",
232 "\n",
233 "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
234 "    X_h, y_h, test_size=0.2, random_state=42\n",
235 ")\n",
236 "\n",
237 "# Random forest regressor\n",
238 "rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)\n",
239 "rf_reg.fit(X_train_h, y_train_h)\n",
240 "\n",
241 "y_pred_h = rf_reg.predict(X_test_h)\n",
242 "\n",
243 "# FIX: mean_squared_error was used below but never imported -> NameError\n",
244 "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
245 "\n",
246 "print(f\"R² Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
247 "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")\n",
248 "print(f\"MAE: {mean_absolute_error(y_test_h, y_pred_h):.4f}\")"
249 ]
249 },
250 {
251 "cell_type": "code",
252 "execution_count": null,
253 "metadata": {},
254 "outputs": [],
255 "source": [
256 "# Feature importances for the regression model, smallest first\n",
257 "importance_reg = (\n",
258 "    pd.Series(rf_reg.feature_importances_, index=housing.feature_names)\n",
259 "    .sort_values(ascending=True)\n",
260 ")\n",
261 "\n",
262 "fig, ax = plt.subplots(figsize=(10, 6))\n",
263 "ax.barh(importance_reg.index, importance_reg.values)\n",
264 "ax.set_xlabel('Feature Importance')\n",
265 "ax.set_title('Random Forest Regressor Feature Importance')\n",
266 "ax.grid(True, alpha=0.3)\n",
267 "fig.tight_layout()\n",
268 "plt.show()"
269 ]
270 },
271 {
272 "cell_type": "markdown",
273 "metadata": {},
274 "source": [
275 "## 정리\n",
276 "\n",
277 "### 핵심 개념\n",
278 "- **배깅(Bagging)**: Bootstrap Aggregating, 여러 모델의 예측을 평균/투표\n",
279 "- **랜덤 특성 선택**: 각 분할에서 일부 특성만 고려\n",
280 "- **OOB (Out-of-Bag) Score**: 부트스트랩에 포함되지 않은 샘플로 평가\n",
281 "\n",
282 "### 주요 하이퍼파라미터\n",
283 "- `n_estimators`: 트리 개수 (많을수록 좋지만 수익 체감)\n",
284 "- `max_depth`: 트리 깊이 (과적합 방지)\n",
285 "- `max_features`: 분할 시 고려할 특성 수\n",
286 "- `min_samples_split`: 분할을 위한 최소 샘플 수\n",
287 "\n",
288 "### 다음 단계\n",
289 "- Gradient Boosting (XGBoost, LightGBM)"
290 ]
291 }
292 ],
293 "metadata": {
294 "kernelspec": {
295 "display_name": "Python 3",
296 "language": "python",
297 "name": "python3"
298 },
299 "language_info": {
300 "name": "python",
301 "version": "3.10.0"
302 }
303 },
304 "nbformat": 4,
305 "nbformat_minor": 4
306}