07_random_forest.ipynb

Download
json 307 lines 8.8 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 07. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ (Random Forest)\n",
  8    "\n",
  9    "## ํ•™์Šต ๋ชฉํ‘œ\n",
 10    "- ์•™์ƒ๋ธ” ํ•™์Šต๊ณผ ๋ฐฐ๊น… ์ดํ•ด\n",
 11    "- ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ์ž‘๋™ ์›๋ฆฌ\n",
 12    "- ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import numpy as np\n",
 22    "import pandas as pd\n",
 23    "import matplotlib.pyplot as plt\n",
 24    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
 25    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
 26    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
 27    "from sklearn.datasets import load_iris, load_wine, fetch_california_housing\n",
 28    "import seaborn as sns\n",
 29    "\n",
 30    "plt.rcParams['font.family'] = 'DejaVu Sans'"
 31   ]
 32  },
 33  {
 34   "cell_type": "markdown",
 35   "metadata": {},
 36   "source": [
 37    "## 1. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋ถ„๋ฅ˜"
 38   ]
 39  },
 40  {
 41   "cell_type": "code",
 42   "execution_count": null,
 43   "metadata": {},
 44   "outputs": [],
 45   "source": [
 46    "# Wine ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ\n",
 47    "wine = load_wine()\n",
 48    "X, y = wine.data, wine.target\n",
 49    "\n",
 50    "print(f\"Features: {wine.feature_names}\")\n",
 51    "print(f\"Classes: {wine.target_names}\")\n",
 52    "print(f\"Shape: {X.shape}\")\n",
 53    "\n",
 54    "X_train, X_test, y_train, y_test = train_test_split(\n",
 55    "    X, y, test_size=0.3, random_state=42\n",
 56    ")"
 57   ]
 58  },
 59  {
 60   "cell_type": "code",
 61   "execution_count": null,
 62   "metadata": {},
 63   "outputs": [],
 64   "source": [
 65    "# ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋ชจ๋ธ\n",
 66    "rf_clf = RandomForestClassifier(\n",
 67    "    n_estimators=100,\n",
 68    "    max_depth=5,\n",
 69    "    random_state=42,\n",
 70    "    n_jobs=-1\n",
 71    ")\n",
 72    "rf_clf.fit(X_train, y_train)\n",
 73    "\n",
 74    "y_pred = rf_clf.predict(X_test)\n",
 75    "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
 76    "print(f\"\\nClassification Report:\")\n",
 77    "print(classification_report(y_test, y_pred, target_names=wine.target_names))"
 78   ]
 79  },
 80  {
 81   "cell_type": "markdown",
 82   "metadata": {},
 83   "source": [
 84    "## 2. ํŠธ๋ฆฌ ๊ฐœ์ˆ˜์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ"
 85   ]
 86  },
 87  {
 88   "cell_type": "code",
 89   "execution_count": null,
 90   "metadata": {},
 91   "outputs": [],
 92   "source": [
 93    "# ํŠธ๋ฆฌ ๊ฐœ์ˆ˜์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ ๋ณ€ํ™”\n",
 94    "n_trees = [1, 5, 10, 20, 50, 100, 200, 500]\n",
 95    "train_scores = []\n",
 96    "test_scores = []\n",
 97    "oob_scores = []\n",
 98    "\n",
 99    "for n in n_trees:\n",
100    "    rf = RandomForestClassifier(n_estimators=n, random_state=42, oob_score=True)\n",
101    "    rf.fit(X_train, y_train)\n",
102    "    train_scores.append(rf.score(X_train, y_train))\n",
103    "    test_scores.append(rf.score(X_test, y_test))\n",
104    "    oob_scores.append(rf.oob_score_)\n",
105    "\n",
106    "plt.figure(figsize=(10, 6))\n",
107    "plt.plot(n_trees, train_scores, 'b-o', label='Train Score')\n",
108    "plt.plot(n_trees, test_scores, 'r-o', label='Test Score')\n",
109    "plt.plot(n_trees, oob_scores, 'g-o', label='OOB Score')\n",
110    "plt.xlabel('Number of Trees')\n",
111    "plt.ylabel('Accuracy')\n",
112    "plt.title('Random Forest: Performance vs Number of Trees')\n",
113    "plt.legend()\n",
114    "plt.grid(True, alpha=0.3)\n",
115    "plt.xscale('log')\n",
116    "plt.show()"
117   ]
118  },
119  {
120   "cell_type": "markdown",
121   "metadata": {},
122   "source": [
123    "## 3. ํŠน์„ฑ ์ค‘์š”๋„"
124   ]
125  },
126  {
127   "cell_type": "code",
128   "execution_count": null,
129   "metadata": {},
130   "outputs": [],
131   "source": [
132    "# ํŠน์„ฑ ์ค‘์š”๋„\n",
133    "importance = pd.DataFrame({\n",
134    "    'Feature': wine.feature_names,\n",
135    "    'Importance': rf_clf.feature_importances_\n",
136    "}).sort_values('Importance', ascending=True)\n",
137    "\n",
138    "plt.figure(figsize=(12, 8))\n",
139    "plt.barh(importance['Feature'], importance['Importance'])\n",
140    "plt.xlabel('Feature Importance')\n",
141    "plt.title('Random Forest Feature Importance - Wine Dataset')\n",
142    "plt.grid(True, alpha=0.3)\n",
143    "plt.tight_layout()\n",
144    "plt.show()"
145   ]
146  },
147  {
148   "cell_type": "markdown",
149   "metadata": {},
150   "source": [
151    "## 4. ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹"
152   ]
153  },
154  {
155   "cell_type": "code",
156   "execution_count": null,
157   "metadata": {},
158   "outputs": [],
159   "source": [
160    "param_grid = {\n",
161    "    'n_estimators': [50, 100, 200],\n",
162    "    'max_depth': [3, 5, 7, None],\n",
163    "    'min_samples_split': [2, 5, 10],\n",
164    "    'max_features': ['sqrt', 'log2', None]\n",
165    "}\n",
166    "\n",
167    "grid_search = GridSearchCV(\n",
168    "    RandomForestClassifier(random_state=42, n_jobs=-1),\n",
169    "    param_grid,\n",
170    "    cv=5,\n",
171    "    scoring='accuracy',\n",
172    "    n_jobs=-1\n",
173    ")\n",
174    "\n",
175    "grid_search.fit(X_train, y_train)\n",
176    "\n",
177    "print(f\"Best Parameters: {grid_search.best_params_}\")\n",
178    "print(f\"Best CV Score: {grid_search.best_score_:.4f}\")\n",
179    "print(f\"Test Score: {grid_search.score(X_test, y_test):.4f}\")"
180   ]
181  },
182  {
183   "cell_type": "markdown",
184   "metadata": {},
185   "source": [
186    "## 5. ๊ฒฐ์ • ํŠธ๋ฆฌ vs ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋น„๊ต"
187   ]
188  },
189  {
190   "cell_type": "code",
191   "execution_count": null,
192   "metadata": {},
193   "outputs": [],
194   "source": [
195    "from sklearn.tree import DecisionTreeClassifier\n",
196    "\n",
197    "# ๋‘ ๋ชจ๋ธ ๋น„๊ต\n",
198    "dt = DecisionTreeClassifier(max_depth=5, random_state=42)\n",
199    "rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)\n",
200    "\n",
201    "dt_scores = cross_val_score(dt, X, y, cv=10)\n",
202    "rf_scores = cross_val_score(rf, X, y, cv=10)\n",
203    "\n",
204    "print(f\"Decision Tree: {dt_scores.mean():.4f} (+/- {dt_scores.std()*2:.4f})\")\n",
205    "print(f\"Random Forest: {rf_scores.mean():.4f} (+/- {rf_scores.std()*2:.4f})\")\n",
206    "\n",
207    "# ๋ฐ•์Šคํ”Œ๋กฏ ๋น„๊ต\n",
208    "plt.figure(figsize=(8, 6))\n",
209    "plt.boxplot([dt_scores, rf_scores], labels=['Decision Tree', 'Random Forest'])\n",
210    "plt.ylabel('Accuracy')\n",
211    "plt.title('Model Comparison: Decision Tree vs Random Forest')\n",
212    "plt.grid(True, alpha=0.3)\n",
213    "plt.show()"
214   ]
215  },
216  {
217   "cell_type": "markdown",
218   "metadata": {},
219   "source": [
220    "## 6. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ํšŒ๊ท€"
221   ]
222  },
223  {
224   "cell_type": "code",
225   "execution_count": null,
226   "metadata": {},
227   "outputs": [],
228   "source": [
229    "# California Housing ๋ฐ์ดํ„ฐ\n",
230    "housing = fetch_california_housing()\n",
231    "X_h, y_h = housing.data, housing.target\n",
232    "\n",
233    "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
234    "    X_h, y_h, test_size=0.2, random_state=42\n",
235    ")\n",
236    "\n",
237    "# ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ํšŒ๊ท€\n",
238    "rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)\n",
239    "rf_reg.fit(X_train_h, y_train_h)\n",
240    "\n",
241    "y_pred_h = rf_reg.predict(X_test_h)\n",
242    "\n",
 243    "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
244    "\n",
245    "print(f\"Rยฒ Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
246    "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")\n",
247    "print(f\"MAE: {mean_absolute_error(y_test_h, y_pred_h):.4f}\")"
248   ]
249  },
250  {
251   "cell_type": "code",
252   "execution_count": null,
253   "metadata": {},
254   "outputs": [],
255   "source": [
256    "# ํŠน์„ฑ ์ค‘์š”๋„ (ํšŒ๊ท€)\n",
257    "importance_reg = pd.DataFrame({\n",
258    "    'Feature': housing.feature_names,\n",
259    "    'Importance': rf_reg.feature_importances_\n",
260    "}).sort_values('Importance', ascending=True)\n",
261    "\n",
262    "plt.figure(figsize=(10, 6))\n",
263    "plt.barh(importance_reg['Feature'], importance_reg['Importance'])\n",
264    "plt.xlabel('Feature Importance')\n",
265    "plt.title('Random Forest Regressor Feature Importance')\n",
266    "plt.grid(True, alpha=0.3)\n",
267    "plt.tight_layout()\n",
268    "plt.show()"
269   ]
270  },
271  {
272   "cell_type": "markdown",
273   "metadata": {},
274   "source": [
275    "## ์ •๋ฆฌ\n",
276    "\n",
277    "### ํ•ต์‹ฌ ๊ฐœ๋…\n",
278    "- **๋ฐฐ๊น… (Bagging)**: Bootstrap Aggregating, ์—ฌ๋Ÿฌ ๋ชจ๋ธ์˜ ์˜ˆ์ธก์„ ํ‰๊ท /ํˆฌํ‘œ\n",
279    "- **๋žœ๋ค ํŠน์„ฑ ์„ ํƒ**: ๊ฐ ๋ถ„ํ• ์—์„œ ์ผ๋ถ€ ํŠน์„ฑ๋งŒ ๊ณ ๋ ค\n",
280    "- **OOB (Out-of-Bag) Score**: ๋ถ€ํŠธ์ŠคํŠธ๋žฉ์— ํฌํ•จ๋˜์ง€ ์•Š์€ ์ƒ˜ํ”Œ๋กœ ํ‰๊ฐ€\n",
281    "\n",
282    "### ์ฃผ์š” ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ\n",
283    "- `n_estimators`: ํŠธ๋ฆฌ ๊ฐœ์ˆ˜ (๋งŽ์„์ˆ˜๋ก ์ข‹์ง€๋งŒ ์ˆ˜์ต ์ฒด๊ฐ)\n",
284    "- `max_depth`: ํŠธ๋ฆฌ ๊นŠ์ด (๊ณผ์ ํ•ฉ ๋ฐฉ์ง€)\n",
285    "- `max_features`: ๋ถ„ํ•  ์‹œ ๊ณ ๋ คํ•  ํŠน์„ฑ ์ˆ˜\n",
286    "- `min_samples_split`: ๋ถ„ํ• ์„ ์œ„ํ•œ ์ตœ์†Œ ์ƒ˜ํ”Œ ์ˆ˜\n",
287    "\n",
288    "### ๋‹ค์Œ ๋‹จ๊ณ„\n",
289    "- Gradient Boosting (XGBoost, LightGBM)"
290   ]
291  }
292 ],
293 "metadata": {
294  "kernelspec": {
295   "display_name": "Python 3",
296   "language": "python",
297   "name": "python3"
298  },
299  "language_info": {
300   "name": "python",
301   "version": "3.10.0"
302  }
303 },
304 "nbformat": 4,
305 "nbformat_minor": 4
306}