07_random_forest.ipynb

Download
json 307 lines 8.8 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 07. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ (Random Forest)\n",
  8    "\n",
  9    "## ํ•™์Šต ๋ชฉํ‘œ\n",
 10    "- ์•™์ƒ๋ธ” ํ•™์Šต๊ณผ ๋ฐฐ๊น… ์ดํ•ด\n",
 11    "- ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ์ž‘๋™ ์›๋ฆฌ\n",
 12    "- ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import numpy as np\n",
 22    "import pandas as pd\n",
 23    "import matplotlib.pyplot as plt\n",
 24    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
 25    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
 26    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
 27    "from sklearn.datasets import load_iris, load_wine, fetch_california_housing\n",
 28    "import seaborn as sns\n",
 29    "\n",
 30    "plt.rcParams['font.family'] = 'DejaVu Sans'"
 31   ]
 32  },
 33  {
 34   "cell_type": "markdown",
 35   "metadata": {},
 36   "source": [
 37    "## 1. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋ถ„๋ฅ˜"
 38   ]
 39  },
 40  {
 41   "cell_type": "code",
 42   "execution_count": null,
 43   "metadata": {},
 44   "outputs": [],
 45   "source": [
 46    "# Wine ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ\n",
 47    "wine = load_wine()\n",
 48    "X, y = wine.data, wine.target\n",
 49    "\n",
 50    "print(f\"Features: {wine.feature_names}\")\n",
 51    "print(f\"Classes: {wine.target_names}\")\n",
 52    "print(f\"Shape: {X.shape}\")\n",
 53    "\n",
 54    "X_train, X_test, y_train, y_test = train_test_split(\n",
 55    "    X, y, test_size=0.3, random_state=42\n",
 56    ")"
 57   ]
 58  },
 59  {
 60   "cell_type": "code",
 61   "execution_count": null,
 62   "metadata": {},
 63   "outputs": [],
 64   "source": [
 65    "# ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋ชจ๋ธ\n",
 66    "rf_clf = RandomForestClassifier(\n",
 67    "    n_estimators=100,\n",
 68    "    max_depth=5,\n",
 69    "    random_state=42,\n",
 70    "    n_jobs=-1\n",
 71    ")\n",
 72    "rf_clf.fit(X_train, y_train)\n",
 73    "\n",
 74    "y_pred = rf_clf.predict(X_test)\n",
 75    "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
 76    "print(f\"\\nClassification Report:\")\n",
 77    "print(classification_report(y_test, y_pred, target_names=wine.target_names))"
 78   ]
 79  },
 80  {
 81   "cell_type": "markdown",
 82   "metadata": {},
 83   "source": [
 84    "## 2. ํŠธ๋ฆฌ ๊ฐœ์ˆ˜์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ"
 85   ]
 86  },
 87  {
 88   "cell_type": "code",
 89   "execution_count": null,
 90   "metadata": {},
 91   "outputs": [],
 92   "source": [
 93    "# ํŠธ๋ฆฌ ๊ฐœ์ˆ˜์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ ๋ณ€ํ™”\n",
 94    "n_trees = [1, 5, 10, 20, 50, 100, 200, 500]\n",
 95    "train_scores = []\n",
 96    "test_scores = []\n",
 97    "oob_scores = []\n",
 98    "\n",
 99    "for n in n_trees:\n",
100    "    rf = RandomForestClassifier(n_estimators=n, random_state=42, oob_score=True)\n",
101    "    rf.fit(X_train, y_train)\n",
102    "    train_scores.append(rf.score(X_train, y_train))\n",
103    "    test_scores.append(rf.score(X_test, y_test))\n",
104    "    oob_scores.append(rf.oob_score_)\n",
105    "\n",
106    "plt.figure(figsize=(10, 6))\n",
107    "plt.plot(n_trees, train_scores, 'b-o', label='Train Score')\n",
108    "plt.plot(n_trees, test_scores, 'r-o', label='Test Score')\n",
109    "plt.plot(n_trees, oob_scores, 'g-o', label='OOB Score')\n",
110    "plt.xlabel('Number of Trees')\n",
111    "plt.ylabel('Accuracy')\n",
112    "plt.title('Random Forest: Performance vs Number of Trees')\n",
113    "plt.legend()\n",
114    "plt.grid(True, alpha=0.3)\n",
115    "plt.xscale('log')\n",
116    "plt.show()"
117   ]
118  },
119  {
120   "cell_type": "markdown",
121   "metadata": {},
122   "source": [
123    "## 3. ํŠน์„ฑ ์ค‘์š”๋„"
124   ]
125  },
126  {
127   "cell_type": "code",
128   "execution_count": null,
129   "metadata": {},
130   "outputs": [],
131   "source": [
132    "# ํŠน์„ฑ ์ค‘์š”๋„\n",
133    "importance = pd.DataFrame({\n",
134    "    'Feature': wine.feature_names,\n",
135    "    'Importance': rf_clf.feature_importances_\n",
136    "}).sort_values('Importance', ascending=True)\n",
137    "\n",
138    "plt.figure(figsize=(12, 8))\n",
139    "plt.barh(importance['Feature'], importance['Importance'])\n",
140    "plt.xlabel('Feature Importance')\n",
141    "plt.title('Random Forest Feature Importance - Wine Dataset')\n",
142    "plt.grid(True, alpha=0.3)\n",
143    "plt.tight_layout()\n",
144    "plt.show()"
145   ]
146  },
147  {
148   "cell_type": "markdown",
149   "metadata": {},
150   "source": [
151    "## 4. ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹"
152   ]
153  },
154  {
155   "cell_type": "code",
156   "execution_count": null,
157   "metadata": {},
158   "outputs": [],
159   "source": [
160    "param_grid = {\n",
161    "    'n_estimators': [50, 100, 200],\n",
162    "    'max_depth': [3, 5, 7, None],\n",
163    "    'min_samples_split': [2, 5, 10],\n",
164    "    'max_features': ['sqrt', 'log2', None]\n",
165    "}\n",
166    "\n",
167    "grid_search = GridSearchCV(\n",
168    "    RandomForestClassifier(random_state=42, n_jobs=-1),\n",
169    "    param_grid,\n",
170    "    cv=5,\n",
171    "    scoring='accuracy',\n",
172    "    n_jobs=-1\n",
173    ")\n",
174    "\n",
175    "grid_search.fit(X_train, y_train)\n",
176    "\n",
177    "print(f\"Best Parameters: {grid_search.best_params_}\")\n",
178    "print(f\"Best CV Score: {grid_search.best_score_:.4f}\")\n",
179    "print(f\"Test Score: {grid_search.score(X_test, y_test):.4f}\")"
180   ]
181  },
182  {
183   "cell_type": "markdown",
184   "metadata": {},
185   "source": [
186    "## 5. ๊ฒฐ์ • ํŠธ๋ฆฌ vs ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ๋น„๊ต"
187   ]
188  },
189  {
190   "cell_type": "code",
191   "execution_count": null,
192   "metadata": {},
193   "outputs": [],
194   "source": [
195    "from sklearn.tree import DecisionTreeClassifier\n",
196    "\n",
197    "# ๋‘ ๋ชจ๋ธ ๋น„๊ต\n",
198    "dt = DecisionTreeClassifier(max_depth=5, random_state=42)\n",
199    "rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)\n",
200    "\n",
201    "dt_scores = cross_val_score(dt, X, y, cv=10)\n",
202    "rf_scores = cross_val_score(rf, X, y, cv=10)\n",
203    "\n",
204    "print(f\"Decision Tree: {dt_scores.mean():.4f} (+/- {dt_scores.std()*2:.4f})\")\n",
205    "print(f\"Random Forest: {rf_scores.mean():.4f} (+/- {rf_scores.std()*2:.4f})\")\n",
206    "\n",
207    "# ๋ฐ•์Šคํ”Œ๋กฏ ๋น„๊ต\n",
208    "plt.figure(figsize=(8, 6))\n",
209    "plt.boxplot([dt_scores, rf_scores], labels=['Decision Tree', 'Random Forest'])\n",
210    "plt.ylabel('Accuracy')\n",
211    "plt.title('Model Comparison: Decision Tree vs Random Forest')\n",
212    "plt.grid(True, alpha=0.3)\n",
213    "plt.show()"
214   ]
215  },
216  {
217   "cell_type": "markdown",
218   "metadata": {},
219   "source": [
220    "## 6. ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ํšŒ๊ท€"
221   ]
222  },
223  {
224   "cell_type": "code",
225   "execution_count": null,
226   "metadata": {},
227   "outputs": [],
228   "source": [
229    "# California Housing ๋ฐ์ดํ„ฐ\n",
230    "housing = fetch_california_housing()\n",
231    "X_h, y_h = housing.data, housing.target\n",
232    "\n",
233    "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
234    "    X_h, y_h, test_size=0.2, random_state=42\n",
235    ")\n",
236    "\n",
237    "# ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ํšŒ๊ท€\n",
238    "rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)\n",
239    "rf_reg.fit(X_train_h, y_train_h)\n",
240    "\n",
241    "y_pred_h = rf_reg.predict(X_test_h)\n",
242    "\n",
 243    "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
244    "\n",
245    "print(f\"Rยฒ Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
246    "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")\n",
247    "print(f\"MAE: {mean_absolute_error(y_test_h, y_pred_h):.4f}\")"
248   ]
249  },
250  {
251   "cell_type": "code",
252   "execution_count": null,
253   "metadata": {},
254   "outputs": [],
255   "source": [
256    "# ํŠน์„ฑ ์ค‘์š”๋„ (ํšŒ๊ท€)\n",
257    "importance_reg = pd.DataFrame({\n",
258    "    'Feature': housing.feature_names,\n",
259    "    'Importance': rf_reg.feature_importances_\n",
260    "}).sort_values('Importance', ascending=True)\n",
261    "\n",
262    "plt.figure(figsize=(10, 6))\n",
263    "plt.barh(importance_reg['Feature'], importance_reg['Importance'])\n",
264    "plt.xlabel('Feature Importance')\n",
265    "plt.title('Random Forest Regressor Feature Importance')\n",
266    "plt.grid(True, alpha=0.3)\n",
267    "plt.tight_layout()\n",
268    "plt.show()"
269   ]
270  },
271  {
272   "cell_type": "markdown",
273   "metadata": {},
274   "source": [
275    "## ์ •๋ฆฌ\n",
276    "\n",
277    "### ํ•ต์‹ฌ ๊ฐœ๋…\n",
278    "- **๋ฐฐ๊น… (Bagging)**: Bootstrap Aggregating, ์—ฌ๋Ÿฌ ๋ชจ๋ธ์˜ ์˜ˆ์ธก์„ ํ‰๊ท /ํˆฌํ‘œ\n",
279    "- **๋žœ๋ค ํŠน์„ฑ ์„ ํƒ**: ๊ฐ ๋ถ„ํ• ์—์„œ ์ผ๋ถ€ ํŠน์„ฑ๋งŒ ๊ณ ๋ ค\n",
280    "- **OOB (Out-of-Bag) Score**: ๋ถ€ํŠธ์ŠคํŠธ๋žฉ์— ํฌํ•จ๋˜์ง€ ์•Š์€ ์ƒ˜ํ”Œ๋กœ ํ‰๊ฐ€\n",
281    "\n",
282    "### ์ฃผ์š” ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ\n",
283    "- `n_estimators`: ํŠธ๋ฆฌ ๊ฐœ์ˆ˜ (๋งŽ์„์ˆ˜๋ก ์ข‹์ง€๋งŒ ์ˆ˜์ต ์ฒด๊ฐ)\n",
284    "- `max_depth`: ํŠธ๋ฆฌ ๊นŠ์ด (๊ณผ์ ํ•ฉ ๋ฐฉ์ง€)\n",
285    "- `max_features`: ๋ถ„ํ•  ์‹œ ๊ณ ๋ คํ•  ํŠน์„ฑ ์ˆ˜\n",
286    "- `min_samples_split`: ๋ถ„ํ• ์„ ์œ„ํ•œ ์ตœ์†Œ ์ƒ˜ํ”Œ ์ˆ˜\n",
287    "\n",
288    "### ๋‹ค์Œ ๋‹จ๊ณ„\n",
289    "- Gradient Boosting (XGBoost, LightGBM)"
290   ]
291  }
292 ],
293 "metadata": {
294  "kernelspec": {
295   "display_name": "Python 3",
296   "language": "python",
297   "name": "python3"
298  },
299  "language_info": {
300   "name": "python",
301   "version": "3.10.0"
302  }
303 },
304 "nbformat": 4,
305 "nbformat_minor": 4
306}