04_cross_validation.ipynb

Download
json 920 lines 30.6 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 교차검증과 하이퍼파라미터 튜닝\n",
  8    "\n",
  9    "이 노트북에서는 모델의 일반화 성능을 평가하는 교차검증과 최적의 하이퍼파라미터를 찾는 방법을 학습합니다.\n",
 10    "\n",
 11    "## 목차\n",
 12    "1. 교차검증 (Cross-Validation)\n",
 13    "   - K-Fold Cross-Validation\n",
 14    "   - Stratified K-Fold\n",
 15    "   - Leave-One-Out CV\n",
 16    "   - Time Series Split\n",
 17    "2. 하이퍼파라미터 튜닝\n",
 18    "   - Grid Search\n",
 19    "   - Randomized Search\n",
 20    "3. 고급 기법\n",
 21    "   - Nested Cross-Validation\n",
 22    "   - Learning Curves"
 23   ]
 24  },
 25  {
 26   "cell_type": "code",
 27   "execution_count": null,
 28   "metadata": {},
 29   "outputs": [],
 30   "source": [
 31    "# ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž„ํฌํŠธ\n",
 32    "import numpy as np\n",
 33    "import pandas as pd\n",
 34    "import matplotlib.pyplot as plt\n",
 35    "import seaborn as sns\n",
 36    "from sklearn.datasets import load_iris, load_breast_cancer, load_diabetes\n",
 37    "from sklearn.model_selection import (\n",
 38    "    cross_val_score, cross_validate,\n",
 39    "    KFold, StratifiedKFold, LeaveOneOut, ShuffleSplit,\n",
 40    "    TimeSeriesSplit, RepeatedKFold,\n",
 41    "    GridSearchCV, RandomizedSearchCV,\n",
 42    "    learning_curve, validation_curve\n",
 43    ")\n",
 44    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
 45    "from sklearn.svm import SVC\n",
 46    "from sklearn.ensemble import RandomForestClassifier\n",
 47    "from sklearn.preprocessing import StandardScaler\n",
 48    "from sklearn.pipeline import Pipeline\n",
 49    "from sklearn.metrics import make_scorer, f1_score, accuracy_score\n",
 50    "from scipy.stats import uniform, randint\n",
 51    "\n",
 52    "# Plot settings. The seaborn style is applied first because set_style() resets font.family.\n",
 53    "sns.set_style('whitegrid')\n",
 54    "plt.rcParams['figure.figsize'] = (10, 6)\n",
 55    "plt.rcParams['font.family'] = ['AppleGothic', 'Malgun Gothic', 'NanumGothic', 'sans-serif']  # Korean-capable fonts (macOS, Windows, Linux), then a generic fallback\n",
 56    "plt.rcParams['axes.unicode_minus'] = False\n",
 57    "\n",
 58    "# Silence all warnings to keep the tutorial output short (this also hides convergence and deprecation warnings)\n",
 59    "import warnings\n",
 60    "warnings.filterwarnings('ignore')"
 61   ]
 62  },
 63  {
 64   "cell_type": "markdown",
 65   "metadata": {},
 66   "source": [
 67    "## 1. 교차검증 (Cross-Validation)\n",
 68    "\n",
 69    "### 1.1 K-Fold Cross-Validation"
 70   ]
 71  },
 72  {
 73   "cell_type": "code",
 74   "execution_count": null,
 75   "metadata": {},
 76   "outputs": [],
 77   "source": [
 78    "# ๋ฐ์ดํ„ฐ ๋กœ๋“œ\n",
 79    "iris = load_iris()\n",
 80    "X, y = iris.data, iris.target\n",
 81    "\n",
 82    "# ๋ชจ๋ธ ์ƒ์„ฑ\n",
 83    "model = LogisticRegression(max_iter=1000, random_state=42)\n",
 84    "\n",
 85    "# K-Fold ๊ต์ฐจ๊ฒ€์ฆ (K=5)\n",
 86    "scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')\n",
 87    "\n",
 88    "print(\"=== K-Fold ๊ต์ฐจ๊ฒ€์ฆ (K=5) ===\")\n",
 89    "print(f\"๊ฐ ํด๋“œ ์ ์ˆ˜: {scores}\")\n",
 90    "print(f\"ํ‰๊ท  ์ •ํ™•๋„: {scores.mean():.4f}\")\n",
 91    "print(f\"ํ‘œ์ค€ํŽธ์ฐจ: {scores.std():.4f}\")\n",
 92    "print(f\"95% ์‹ ๋ขฐ๊ตฌ๊ฐ„: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})\")"
 93   ]
 94  },
 95  {
 96   "cell_type": "code",
 97   "execution_count": null,
 98   "metadata": {},
 99   "outputs": [],
100   "source": [
101    "# K-Fold ์ˆ˜๋™ ๊ตฌํ˜„์œผ๋กœ ์ดํ•ดํ•˜๊ธฐ\n",
102    "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
103    "\n",
104    "print(\"\\n=== K-Fold ๋ถ„ํ•  ์‹œ๊ฐํ™” ===\")\n",
105    "fold_scores = []\n",
106    "\n",
107    "for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):\n",
108    "    # ๋ฐ์ดํ„ฐ ๋ถ„ํ• \n",
109    "    X_train, X_val = X[train_idx], X[val_idx]\n",
110    "    y_train, y_val = y[train_idx], y[val_idx]\n",
111    "    \n",
112    "    # ๋ชจ๋ธ ํ•™์Šต ๋ฐ ํ‰๊ฐ€\n",
113    "    model_fold = LogisticRegression(max_iter=1000, random_state=42)\n",
114    "    model_fold.fit(X_train, y_train)\n",
115    "    score = model_fold.score(X_val, y_val)\n",
116    "    fold_scores.append(score)\n",
117    "    \n",
118    "    print(f\"Fold {fold}: Train={len(train_idx)}, Val={len(val_idx)}, Accuracy={score:.4f}\")\n",
119    "\n",
120    "print(f\"\\nํ‰๊ท  ์ •ํ™•๋„: {np.mean(fold_scores):.4f}\")"
121   ]
122  },
123  {
124   "cell_type": "code",
125   "execution_count": null,
126   "metadata": {},
127   "outputs": [],
128   "source": [
129    "# K ๊ฐ’์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ ๋น„๊ต\n",
130    "k_values = [3, 5, 10, 15, 20]\n",
131    "mean_scores = []\n",
132    "std_scores = []\n",
133    "\n",
134    "for k in k_values:\n",
135    "    scores = cross_val_score(model, X, y, cv=k, scoring='accuracy')\n",
136    "    mean_scores.append(scores.mean())\n",
137    "    std_scores.append(scores.std())\n",
138    "\n",
139    "# ์‹œ๊ฐํ™”\n",
140    "plt.figure(figsize=(10, 6))\n",
141    "plt.errorbar(k_values, mean_scores, yerr=std_scores, marker='o', \n",
142    "             capsize=5, capthick=2, linewidth=2)\n",
143    "plt.xlabel('K (Number of Folds)', fontsize=12)\n",
144    "plt.ylabel('Mean Accuracy', fontsize=12)\n",
145    "plt.title('K-Fold CV Performance vs K Value', fontsize=14, pad=20)\n",
146    "plt.grid(True, alpha=0.3)\n",
147    "plt.show()\n",
148    "\n",
149    "print(\"\\n=== K ๊ฐ’์— ๋”ฐ๋ฅธ ์„ฑ๋Šฅ ===\")\n",
150    "for k, mean, std in zip(k_values, mean_scores, std_scores):\n",
151    "    print(f\"K={k:2d}: {mean:.4f} (+/- {std:.4f})\")"
152   ]
153  },
154  {
155   "cell_type": "markdown",
156   "metadata": {},
157   "source": [
158    "### 1.2 Stratified K-Fold (계층화 K-Fold)"
159   ]
160  },
161  {
162   "cell_type": "code",
163   "execution_count": null,
164   "metadata": {},
165   "outputs": [],
166   "source": [
167    "# ํด๋ž˜์Šค ๋น„์œจ ํ™•์ธ\n",
168    "unique, counts = np.unique(y, return_counts=True)\n",
169    "print(\"์ „์ฒด ๋ฐ์ดํ„ฐ ํด๋ž˜์Šค ๋ถ„ํฌ:\")\n",
170    "for cls, cnt in zip(unique, counts):\n",
171    "    print(f\"  Class {cls}: {cnt} ({cnt/len(y)*100:.1f}%)\")\n",
172    "\n",
173    "# Stratified K-Fold\n",
174    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
175    "\n",
176    "print(\"\\n=== Stratified K-Fold ๊ฐ ํด๋“œ์˜ ํด๋ž˜์Šค ๋ถ„ํฌ ===\")\n",
177    "for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):\n",
178    "    train_classes = np.bincount(y[train_idx])\n",
179    "    val_classes = np.bincount(y[val_idx])\n",
180    "    \n",
181    "    print(f\"Fold {fold}:\")\n",
182    "    print(f\"  Train: {train_classes} ({train_classes/train_classes.sum()*100})\")\n",
183    "    print(f\"  Val:   {val_classes} ({val_classes/val_classes.sum()*100})\")"
184   ]
185  },
186  {
187   "cell_type": "code",
188   "execution_count": null,
189   "metadata": {},
190   "outputs": [],
191   "source": [
192    "# K-Fold vs Stratified K-Fold ์„ฑ๋Šฅ ๋น„๊ต\n",
193    "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
194    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
195    "\n",
196    "scores_kf = cross_val_score(model, X, y, cv=kf, scoring='accuracy')\n",
197    "scores_skf = cross_val_score(model, X, y, cv=skf, scoring='accuracy')\n",
198    "\n",
199    "print(\"=== K-Fold vs Stratified K-Fold ===\")\n",
200    "print(f\"K-Fold:           {scores_kf.mean():.4f} (+/- {scores_kf.std():.4f})\")\n",
201    "print(f\"Stratified K-Fold: {scores_skf.mean():.4f} (+/- {scores_skf.std():.4f})\")\n",
202    "print(\"\\n๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ์—์„œ๋Š” Stratified K-Fold๊ฐ€ ๋” ์•ˆ์ •์ ์ž…๋‹ˆ๋‹ค.\")"
203   ]
204  },
205  {
206   "cell_type": "markdown",
207   "metadata": {},
208   "source": [
209    "### 1.3 다양한 교차검증 방법"
210   ]
211  },
212  {
213   "cell_type": "code",
214   "execution_count": null,
215   "metadata": {},
216   "outputs": [],
217   "source": [
218    "# Leave-One-Out (LOO)\n",
219    "loo = LeaveOneOut()\n",
220    "print(f\"Leave-One-Out ๋ถ„ํ•  ์ˆ˜: {loo.get_n_splits(X)} (๋ฐ์ดํ„ฐ ์ˆ˜์™€ ๋™์ผ)\")\n",
221    "print(\"LOO๋Š” ์ž‘์€ ๋ฐ์ดํ„ฐ์…‹์—์„œ ์œ ์šฉํ•˜์ง€๋งŒ ๊ณ„์‚ฐ ๋น„์šฉ์ด ๋†’์Šต๋‹ˆ๋‹ค.\\n\")\n",
222    "\n",
223    "# Shuffle Split (๋žœ๋ค ๋ถ„ํ• )\n",
224    "ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)\n",
225    "scores_ss = cross_val_score(model, X, y, cv=ss, scoring='accuracy')\n",
226    "print(f\"Shuffle Split ํ‰๊ท : {scores_ss.mean():.4f} (+/- {scores_ss.std():.4f})\")\n",
227    "print(\"๊ฐ ๋ถ„ํ• ์ด ๋…๋ฆฝ์ ์œผ๋กœ ๋žœ๋ค ์ƒ˜ํ”Œ๋ง๋ฉ๋‹ˆ๋‹ค.\\n\")\n",
228    "\n",
229    "# Repeated K-Fold (๋ฐ˜๋ณต)\n",
230    "rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)\n",
231    "scores_rkf = cross_val_score(model, X, y, cv=rkf, scoring='accuracy')\n",
232    "print(f\"Repeated K-Fold ํ‰๊ท : {scores_rkf.mean():.4f} (+/- {scores_rkf.std():.4f})\")\n",
233    "print(f\"์ด ๋ถ„ํ•  ์ˆ˜: {len(scores_rkf)} (5 folds ร— 10 repeats = 50)\")\n",
234    "print(\"๋” ์•ˆ์ •์ ์ธ ์ถ”์ •์„ ์œ„ํ•ด ์—ฌ๋Ÿฌ ๋ฒˆ ๋ฐ˜๋ณตํ•ฉ๋‹ˆ๋‹ค.\")"
235   ]
236  },
237  {
238   "cell_type": "markdown",
239   "metadata": {},
240   "source": [
241    "### 1.4 시계열 교차검증 (Time Series Split)"
242   ]
243  },
244  {
245   "cell_type": "code",
246   "execution_count": null,
247   "metadata": {},
248   "outputs": [],
249   "source": [
250    "# ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ์šฉ ๊ต์ฐจ๊ฒ€์ฆ\n",
251    "tscv = TimeSeriesSplit(n_splits=5)\n",
252    "\n",
253    "print(\"=== Time Series Split ===\")\n",
254    "print(\"์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ๋Š” ๊ณผ๊ฑฐ โ†’ ๋ฏธ๋ž˜ ์ˆœ์„œ๋ฅผ ์œ ์ง€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.\\n\")\n",
255    "\n",
256    "for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):\n",
257    "    print(f\"Fold {fold}:\")\n",
258    "    print(f\"  Train: [{train_idx[0]:3d}:{train_idx[-1]:3d}] ({len(train_idx)} samples)\")\n",
259    "    print(f\"  Test:  [{test_idx[0]:3d}:{test_idx[-1]:3d}] ({len(test_idx)} samples)\")"
260   ]
261  },
262  {
263   "cell_type": "code",
264   "execution_count": null,
265   "metadata": {},
266   "outputs": [],
267   "source": [
268    "# Time Series Split ์‹œ๊ฐํ™”\n",
269    "fig, ax = plt.subplots(figsize=(12, 6))\n",
270    "\n",
271    "for i, (train, test) in enumerate(tscv.split(X)):\n",
272    "    # Train set\n",
273    "    ax.barh(i, len(train), left=train[0], height=0.4, \n",
274    "            align='center', color='blue', alpha=0.6, label='Train' if i == 0 else '')\n",
275    "    # Test set\n",
276    "    ax.barh(i, len(test), left=test[0], height=0.4, \n",
277    "            align='center', color='red', alpha=0.6, label='Test' if i == 0 else '')\n",
278    "\n",
279    "ax.set_yticks(range(tscv.n_splits))\n",
280    "ax.set_yticklabels([f'Fold {i+1}' for i in range(tscv.n_splits)])\n",
281    "ax.set_xlabel('Sample Index', fontsize=12)\n",
282    "ax.set_title('Time Series Split Visualization', fontsize=14, pad=20)\n",
283    "ax.legend(loc='upper left', fontsize=11)\n",
284    "plt.tight_layout()\n",
285    "plt.show()"
286   ]
287  },
288  {
289   "cell_type": "markdown",
290   "metadata": {},
291   "source": [
292    "## 2. cross_val_score vs cross_validate"
293   ]
294  },
295  {
296   "cell_type": "code",
297   "execution_count": null,
298   "metadata": {},
299   "outputs": [],
300   "source": [
301    "# cross_validate: ์—ฌ๋Ÿฌ ์ง€ํ‘œ ๋™์‹œ ํ‰๊ฐ€\n",
302    "scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']\n",
303    "\n",
304    "cv_results = cross_validate(\n",
305    "    model, X, y,\n",
306    "    cv=5,\n",
307    "    scoring=scoring,\n",
308    "    return_train_score=True\n",
309    ")\n",
310    "\n",
311    "print(\"=== cross_validate ๊ฒฐ๊ณผ ===\")\n",
312    "for metric in scoring:\n",
313    "    train_key = f'train_{metric}'\n",
314    "    test_key = f'test_{metric}'\n",
315    "    print(f\"\\n{metric}:\")\n",
316    "    print(f\"  Train: {cv_results[train_key].mean():.4f} (+/- {cv_results[train_key].std():.4f})\")\n",
317    "    print(f\"  Test:  {cv_results[test_key].mean():.4f} (+/- {cv_results[test_key].std():.4f})\")\n",
318    "\n",
319    "print(f\"\\nํ‰๊ท  ํ•™์Šต ์‹œ๊ฐ„: {cv_results['fit_time'].mean():.4f}์ดˆ\")\n",
320    "print(f\"ํ‰๊ท  ์˜ˆ์ธก ์‹œ๊ฐ„: {cv_results['score_time'].mean():.4f}์ดˆ\")"
321   ]
322  },
323  {
324   "cell_type": "code",
325   "execution_count": null,
326   "metadata": {},
327   "outputs": [],
328   "source": [
329    "# ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”\n",
330    "metrics_df = pd.DataFrame({\n",
331    "    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
332    "    'Train': [cv_results[f'train_{m}'].mean() for m in scoring],\n",
333    "    'Test': [cv_results[f'test_{m}'].mean() for m in scoring]\n",
334    "})\n",
335    "\n",
336    "x = np.arange(len(metrics_df))\n",
337    "width = 0.35\n",
338    "\n",
339    "fig, ax = plt.subplots(figsize=(10, 6))\n",
340    "bars1 = ax.bar(x - width/2, metrics_df['Train'], width, label='Train', alpha=0.8)\n",
341    "bars2 = ax.bar(x + width/2, metrics_df['Test'], width, label='Test', alpha=0.8)\n",
342    "\n",
343    "ax.set_xlabel('Metrics', fontsize=12)\n",
344    "ax.set_ylabel('Score', fontsize=12)\n",
345    "ax.set_title('Train vs Test Scores (5-Fold CV)', fontsize=14, pad=20)\n",
346    "ax.set_xticks(x)\n",
347    "ax.set_xticklabels(metrics_df['Metric'])\n",
348    "ax.legend(fontsize=11)\n",
349    "ax.set_ylim([0.9, 1.0])\n",
350    "ax.grid(True, alpha=0.3, axis='y')\n",
351    "\n",
352    "plt.tight_layout()\n",
353    "plt.show()"
354   ]
355  },
356  {
357   "cell_type": "markdown",
358   "metadata": {},
359   "source": [
360    "## 3. 하이퍼파라미터 튜닝\n",
361    "\n",
362    "### 3.1 Grid Search"
363   ]
364  },
365  {
366   "cell_type": "code",
367   "execution_count": null,
368   "metadata": {},
369   "outputs": [],
370   "source": [
371    "# Breast Cancer ๋ฐ์ดํ„ฐ์…‹\n",
372    "cancer = load_breast_cancer()\n",
373    "X_cancer, y_cancer = cancer.data, cancer.target\n",
374    "\n",
375    "# ์Šค์ผ€์ผ๋ง\n",
376    "scaler = StandardScaler()\n",
377    "X_scaled = scaler.fit_transform(X_cancer)\n",
378    "\n",
379    "# ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๊ทธ๋ฆฌ๋“œ\n",
380    "param_grid = {\n",
381    "    'C': [0.1, 1, 10, 100],\n",
382    "    'gamma': [1, 0.1, 0.01, 0.001],\n",
383    "    'kernel': ['rbf', 'linear']\n",
384    "}\n",
385    "\n",
386    "print(\"=== Grid Search ===\")\n",
387    "print(f\"ํŒŒ๋ผ๋ฏธํ„ฐ ์กฐํ•ฉ ์ˆ˜: {len(param_grid['C']) * len(param_grid['gamma']) * len(param_grid['kernel'])}\")\n",
388    "print(f\"CV Folds: 5\")\n",
389    "print(f\"์ด fit ํšŸ์ˆ˜: {32 * 5} = 160\\n\")"
390   ]
391  },
392  {
393   "cell_type": "code",
394   "execution_count": null,
395   "metadata": {},
396   "outputs": [],
397   "source": [
398    "# Grid Search ์‹คํ–‰\n",
399    "grid_search = GridSearchCV(\n",
400    "    SVC(random_state=42),\n",
401    "    param_grid,\n",
402    "    cv=5,\n",
403    "    scoring='accuracy',\n",
404    "    verbose=1,\n",
405    "    n_jobs=-1  # ๋ชจ๋“  CPU ์‚ฌ์šฉ\n",
406    ")\n",
407    "\n",
408    "grid_search.fit(X_scaled, y_cancer)\n",
409    "\n",
410    "print(\"\\n=== Grid Search ๊ฒฐ๊ณผ ===\")\n",
411    "print(f\"์ตœ์  ํŒŒ๋ผ๋ฏธํ„ฐ: {grid_search.best_params_}\")\n",
412    "print(f\"์ตœ์  ์ ์ˆ˜: {grid_search.best_score_:.4f}\")\n",
413    "print(f\"์ตœ์  ๋ชจ๋ธ: {grid_search.best_estimator_}\")"
414   ]
415  },
416  {
417   "cell_type": "code",
418   "execution_count": null,
419   "metadata": {},
420   "outputs": [],
421   "source": [
422    "# ๋ชจ๋“  ๊ฒฐ๊ณผ ํ™•์ธ\n",
423    "results_df = pd.DataFrame(grid_search.cv_results_)\n",
424    "\n",
425    "# ์ƒ์œ„ 10๊ฐœ ์กฐํ•ฉ\n",
426    "top_results = results_df.nsmallest(10, 'rank_test_score')[[\n",
427    "    'params', 'mean_test_score', 'std_test_score', 'rank_test_score'\n",
428    "]]\n",
429    "\n",
430    "print(\"\\n=== ์ƒ์œ„ 10๊ฐœ ํŒŒ๋ผ๋ฏธํ„ฐ ์กฐํ•ฉ ===\")\n",
431    "for idx, row in top_results.iterrows():\n",
432    "    print(f\"Rank {int(row['rank_test_score'])}: {row['params']}\")\n",
433    "    print(f\"  Score: {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})\\n\")"
434   ]
435  },
436  {
437   "cell_type": "code",
438   "execution_count": null,
439   "metadata": {},
440   "outputs": [],
441   "source": [
442    "# Grid Search ๊ฒฐ๊ณผ ํžˆํŠธ๋งต (C vs gamma, rbf kernel)\n",
443    "rbf_results = results_df[results_df['param_kernel'] == 'rbf']\n",
444    "\n",
445    "# Pivot table ์ƒ์„ฑ\n",
446    "pivot_table = rbf_results.pivot_table(\n",
447    "    values='mean_test_score',\n",
448    "    index='param_gamma',\n",
449    "    columns='param_C'\n",
450    ")\n",
451    "\n",
452    "plt.figure(figsize=(10, 8))\n",
453    "sns.heatmap(pivot_table, annot=True, fmt='.4f', cmap='YlGnBu', \n",
454    "            cbar_kws={'label': 'Accuracy'})\n",
455    "plt.title('Grid Search Results (RBF Kernel): C vs Gamma', fontsize=14, pad=20)\n",
456    "plt.xlabel('C (Regularization Parameter)', fontsize=12)\n",
457    "plt.ylabel('Gamma', fontsize=12)\n",
458    "plt.tight_layout()\n",
459    "plt.show()"
460   ]
461  },
462  {
463   "cell_type": "markdown",
464   "metadata": {},
465   "source": [
466    "### 3.2 Randomized Search"
467   ]
468  },
469  {
470   "cell_type": "code",
471   "execution_count": null,
472   "metadata": {},
473   "outputs": [],
474   "source": [
475    "# ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ๋ถ„ํฌ ์ •์˜\n",
476    "param_distributions = {\n",
477    "    'C': uniform(0.1, 100),  # 0.1 ~ 100.1 ๊ท ๋“ฑ ๋ถ„ํฌ\n",
478    "    'gamma': uniform(0.001, 1),  # 0.001 ~ 1.001 ๊ท ๋“ฑ ๋ถ„ํฌ\n",
479    "    'kernel': ['rbf', 'linear', 'poly']\n",
480    "}\n",
481    "\n",
482    "# Randomized Search ์‹คํ–‰\n",
483    "random_search = RandomizedSearchCV(\n",
484    "    SVC(random_state=42),\n",
485    "    param_distributions,\n",
486    "    n_iter=50,  # 50๊ฐœ ์กฐํ•ฉ ์‹œ๋„\n",
487    "    cv=5,\n",
488    "    scoring='accuracy',\n",
489    "    random_state=42,\n",
490    "    verbose=1,\n",
491    "    n_jobs=-1\n",
492    ")\n",
493    "\n",
494    "random_search.fit(X_scaled, y_cancer)\n",
495    "\n",
496    "print(\"\\n=== Randomized Search ๊ฒฐ๊ณผ ===\")\n",
497    "print(f\"์ตœ์  ํŒŒ๋ผ๋ฏธํ„ฐ: {random_search.best_params_}\")\n",
498    "print(f\"์ตœ์  ์ ์ˆ˜: {random_search.best_score_:.4f}\")"
499   ]
500  },
501  {
502   "cell_type": "code",
503   "execution_count": null,
504   "metadata": {},
505   "outputs": [],
506   "source": [
507    "# Grid Search vs Randomized Search ๋น„๊ต\n",
508    "comparison_df = pd.DataFrame({\n",
509    "    'Method': ['Grid Search', 'Randomized Search'],\n",
510    "    'Best Score': [grid_search.best_score_, random_search.best_score_],\n",
511    "    'N Iterations': [len(grid_search.cv_results_['params']), \n",
512    "                     len(random_search.cv_results_['params'])]\n",
513    "})\n",
514    "\n",
515    "print(\"\\n=== Grid Search vs Randomized Search ===\")\n",
516    "print(comparison_df.to_string(index=False))\n",
517    "print(\"\\nRandomized Search:\")\n",
518    "print(\"  - ์žฅ์ : ๊ณ„์‚ฐ ํšจ์œจ์ , ์—ฐ์† ๋ถ„ํฌ ํƒ์ƒ‰ ๊ฐ€๋Šฅ\")\n",
519    "print(\"  - ๋‹จ์ : ์ตœ์ ํ•ด ๋ณด์žฅ ์—†์Œ\")\n",
520    "print(\"\\nGrid Search:\")\n",
521    "print(\"  - ์žฅ์ : ๋ชจ๋“  ์กฐํ•ฉ ํƒ์ƒ‰, ์ตœ์ ํ•ด ๋ณด์žฅ (๊ทธ๋ฆฌ๋“œ ๋‚ด)\")\n",
522    "print(\"  - ๋‹จ์ : ์กฐํ•ฉ ์ˆ˜๊ฐ€ ๊ธฐํ•˜๊ธ‰์ˆ˜์ ์œผ๋กœ ์ฆ๊ฐ€\")"
523   ]
524  },
525  {
526   "cell_type": "markdown",
527   "metadata": {},
528   "source": [
529    "### 3.3 Random Forest 하이퍼파라미터 튜닝"
530   ]
531  },
532  {
533   "cell_type": "code",
534   "execution_count": null,
535   "metadata": {},
536   "outputs": [],
537   "source": [
538    "# Random Forest ํŒŒ๋ผ๋ฏธํ„ฐ ๊ทธ๋ฆฌ๋“œ\n",
539    "rf_param_grid = {\n",
540    "    'n_estimators': [50, 100, 200],\n",
541    "    'max_depth': [None, 10, 20, 30],\n",
542    "    'min_samples_split': [2, 5, 10],\n",
543    "    'min_samples_leaf': [1, 2, 4]\n",
544    "}\n",
545    "\n",
546    "rf_grid_search = GridSearchCV(\n",
547    "    RandomForestClassifier(random_state=42),\n",
548    "    rf_param_grid,\n",
549    "    cv=5,\n",
550    "    scoring='accuracy',\n",
551    "    verbose=1,\n",
552    "    n_jobs=-1\n",
553    ")\n",
554    "\n",
555    "rf_grid_search.fit(X_cancer, y_cancer)\n",
556    "\n",
557    "print(\"\\n=== Random Forest Grid Search ๊ฒฐ๊ณผ ===\")\n",
558    "print(f\"์ตœ์  ํŒŒ๋ผ๋ฏธํ„ฐ: {rf_grid_search.best_params_}\")\n",
559    "print(f\"์ตœ์  ์ ์ˆ˜: {rf_grid_search.best_score_:.4f}\")"
560   ]
561  },
562  {
563   "cell_type": "code",
564   "execution_count": null,
565   "metadata": {},
566   "outputs": [],
567   "source": [
568    "# Feature Importance ์‹œ๊ฐํ™”\n",
569    "best_rf = rf_grid_search.best_estimator_\n",
570    "feature_importance = pd.DataFrame({\n",
571    "    'feature': cancer.feature_names,\n",
572    "    'importance': best_rf.feature_importances_\n",
573    "}).sort_values('importance', ascending=False)\n",
574    "\n",
575    "plt.figure(figsize=(10, 8))\n",
576    "plt.barh(feature_importance['feature'][:15], feature_importance['importance'][:15])\n",
577    "plt.xlabel('Importance', fontsize=12)\n",
578    "plt.title('Top 15 Feature Importances (Optimized Random Forest)', fontsize=14, pad=20)\n",
579    "plt.gca().invert_yaxis()\n",
580    "plt.tight_layout()\n",
581    "plt.show()\n",
582    "\n",
583    "print(\"\\nTop 5 Features:\")\n",
584    "print(feature_importance.head().to_string(index=False))"
585   ]
586  },
587  {
588   "cell_type": "markdown",
589   "metadata": {},
590   "source": [
591    "## 4. 중첩 교차검증 (Nested Cross-Validation)"
592   ]
593  },
594  {
595   "cell_type": "code",
596   "execution_count": null,
597   "metadata": {},
598   "outputs": [],
599   "source": [
600    "# ์ค‘์ฒฉ CV: ์™ธ๋ถ€ ๋ฃจํ”„(๋ชจ๋ธ ํ‰๊ฐ€) + ๋‚ด๋ถ€ ๋ฃจํ”„(ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹)\n",
601    "\n",
602    "# ๋‚ด๋ถ€ CV (ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹)\n",
603    "param_grid_nested = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01]}\n",
604    "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)\n",
605    "grid_search_nested = GridSearchCV(\n",
606    "    SVC(kernel='rbf', random_state=42), \n",
607    "    param_grid_nested, \n",
608    "    cv=inner_cv, \n",
609    "    scoring='accuracy'\n",
610    ")\n",
611    "\n",
612    "# ์™ธ๋ถ€ CV (๋ชจ๋ธ ํ‰๊ฐ€)\n",
613    "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
614    "nested_scores = cross_val_score(\n",
615    "    grid_search_nested, \n",
616    "    X_scaled, \n",
617    "    y_cancer, \n",
618    "    cv=outer_cv, \n",
619    "    scoring='accuracy'\n",
620    ")\n",
621    "\n",
622    "print(\"=== ์ค‘์ฒฉ ๊ต์ฐจ๊ฒ€์ฆ ๊ฒฐ๊ณผ ===\")\n",
623    "print(f\"๊ฐ ์™ธ๋ถ€ ํด๋“œ ์ ์ˆ˜: {nested_scores}\")\n",
624    "print(f\"ํ‰๊ท  ์ ์ˆ˜: {nested_scores.mean():.4f} (+/- {nested_scores.std():.4f})\")\n",
625    "\n",
626    "# ๋น„๊ต: ์ผ๋ฐ˜ CV vs ์ค‘์ฒฉ CV\n",
627    "grid_search_nested.fit(X_scaled, y_cancer)\n",
628    "print(f\"\\n์ผ๋ฐ˜ CV ์ตœ์  ์ ์ˆ˜: {grid_search_nested.best_score_:.4f}\")\n",
629    "print(f\"์ค‘์ฒฉ CV ํ‰๊ท  ์ ์ˆ˜: {nested_scores.mean():.4f}\")\n",
630    "print(\"\\n์ค‘์ฒฉ CV๊ฐ€ ๋” ํ˜„์‹ค์ ์ธ ์ผ๋ฐ˜ํ™” ์„ฑ๋Šฅ์„ ์ถ”์ •ํ•ฉ๋‹ˆ๋‹ค.\")\n",
631    "print(\"์ผ๋ฐ˜ CV๋Š” ๊ณผ๋Œ€ํ‰๊ฐ€๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค (๋ฐ์ดํ„ฐ ๋ˆ„์ˆ˜).\")"
632   ]
633  },
634  {
635   "cell_type": "markdown",
636   "metadata": {},
637   "source": [
638    "## 5. 파이프라인과 함께 사용"
639   ]
640  },
641  {
642   "cell_type": "code",
643   "execution_count": null,
644   "metadata": {},
645   "outputs": [],
646   "source": [
647    "# ํŒŒ์ดํ”„๋ผ์ธ ์ •์˜\n",
648    "pipeline = Pipeline([\n",
649    "    ('scaler', StandardScaler()),\n",
650    "    ('svm', SVC(random_state=42))\n",
651    "])\n",
652    "\n",
653    "# ํŒŒ๋ผ๋ฏธํ„ฐ ์ด๋ฆ„: step__parameter\n",
654    "param_grid_pipeline = {\n",
655    "    'svm__C': [0.1, 1, 10],\n",
656    "    'svm__gamma': [0.1, 0.01, 0.001],\n",
657    "    'svm__kernel': ['rbf', 'linear']\n",
658    "}\n",
659    "\n",
660    "grid_search_pipeline = GridSearchCV(\n",
661    "    pipeline, \n",
662    "    param_grid_pipeline, \n",
663    "    cv=5, \n",
664    "    scoring='accuracy',\n",
665    "    verbose=1,\n",
666    "    n_jobs=-1\n",
667    ")\n",
668    "\n",
669    "# ์Šค์ผ€์ผ๋ง๋˜์ง€ ์•Š์€ ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ (ํŒŒ์ดํ”„๋ผ์ธ์ด ์ฒ˜๋ฆฌ)\n",
670    "grid_search_pipeline.fit(X_cancer, y_cancer)\n",
671    "\n",
672    "print(\"\\n=== ํŒŒ์ดํ”„๋ผ์ธ Grid Search ๊ฒฐ๊ณผ ===\")\n",
673    "print(f\"์ตœ์  ํŒŒ๋ผ๋ฏธํ„ฐ: {grid_search_pipeline.best_params_}\")\n",
674    "print(f\"์ตœ์  ์ ์ˆ˜: {grid_search_pipeline.best_score_:.4f}\")\n",
675    "print(\"\\nํŒŒ์ดํ”„๋ผ์ธ ์‚ฌ์šฉ์˜ ์žฅ์ :\")\n",
676    "print(\"  - ์ „์ฒ˜๋ฆฌ ๋‹จ๊ณ„๋ฅผ ์ž๋™์œผ๋กœ ํฌํ•จ\")\n",
677    "print(\"  - CV์—์„œ ๋ฐ์ดํ„ฐ ๋ˆ„์ˆ˜ ๋ฐฉ์ง€\")\n",
678    "print(\"  - ์ฝ”๋“œ ๊ฐ„๊ฒฐ์„ฑ\")"
679   ]
680  },
681  {
682   "cell_type": "markdown",
683   "metadata": {},
684   "source": [
685    "## 6. 학습 곡선 (Learning Curves)"
686   ]
687  },
688  {
689   "cell_type": "code",
690   "execution_count": null,
691   "metadata": {},
692   "outputs": [],
693   "source": [
694    "# ํ•™์Šต ๊ณก์„  ๊ณ„์‚ฐ\n",
695    "train_sizes, train_scores, val_scores = learning_curve(\n",
696    "    grid_search_pipeline.best_estimator_,\n",
697    "    X_cancer, y_cancer,\n",
698    "    train_sizes=np.linspace(0.1, 1.0, 10),\n",
699    "    cv=5,\n",
700    "    scoring='accuracy',\n",
701    "    n_jobs=-1\n",
702    ")\n",
703    "\n",
704    "# ํ‰๊ท  ๋ฐ ํ‘œ์ค€ํŽธ์ฐจ\n",
705    "train_mean = train_scores.mean(axis=1)\n",
706    "train_std = train_scores.std(axis=1)\n",
707    "val_mean = val_scores.mean(axis=1)\n",
708    "val_std = val_scores.std(axis=1)\n",
709    "\n",
710    "# ํ•™์Šต ๊ณก์„  ์‹œ๊ฐํ™”\n",
711    "plt.figure(figsize=(10, 6))\n",
712    "plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, \n",
713    "                 alpha=0.2, color='blue')\n",
714    "plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, \n",
715    "                 alpha=0.2, color='orange')\n",
716    "plt.plot(train_sizes, train_mean, 'o-', color='blue', linewidth=2, \n",
717    "         label='Training Score')\n",
718    "plt.plot(train_sizes, val_mean, 'o-', color='orange', linewidth=2, \n",
719    "         label='Validation Score')\n",
720    "plt.xlabel('Training Set Size', fontsize=12)\n",
721    "plt.ylabel('Accuracy', fontsize=12)\n",
722    "plt.title('Learning Curve', fontsize=14, pad=20)\n",
723    "plt.legend(loc='best', fontsize=11)\n",
724    "plt.grid(True, alpha=0.3)\n",
725    "plt.show()\n",
726    "\n",
727    "print(\"ํ•™์Šต ๊ณก์„  ํ•ด์„:\")\n",
728    "print(\"  - ๋‘ ๊ณก์„ ์ด ๋ชจ๋‘ ๋‚ฎ์Œ โ†’ ๊ณผ์†Œ์ ํ•ฉ\")\n",
729    "print(\"  - ํ›ˆ๋ จ ๊ณก์„  ๋†’๊ณ  ๊ฒ€์ฆ ๊ณก์„  ๋‚ฎ์Œ โ†’ ๊ณผ์ ํ•ฉ\")\n",
730    "print(\"  - ๋‘ ๊ณก์„ ์ด ์ˆ˜๋ ด โ†’ ์ ์ ˆํ•œ ์ ํ•ฉ\")"
731   ]
732  },
733  {
734   "cell_type": "markdown",
735   "metadata": {},
736   "source": [
737    "## 7. 검증 곡선 (Validation Curve)"
738   ]
739  },
740  {
741   "cell_type": "code",
742   "execution_count": null,
743   "metadata": {},
744   "outputs": [],
745   "source": [
746    "# C ํŒŒ๋ผ๋ฏธํ„ฐ์— ๋Œ€ํ•œ ๊ฒ€์ฆ ๊ณก์„ \n",
747    "param_range = np.logspace(-4, 2, 10)\n",
748    "\n",
749    "train_scores_val, test_scores_val = validation_curve(\n",
750    "    SVC(kernel='rbf', gamma=0.01, random_state=42),\n",
751    "    X_scaled, y_cancer,\n",
752    "    param_name='C',\n",
753    "    param_range=param_range,\n",
754    "    cv=5,\n",
755    "    scoring='accuracy',\n",
756    "    n_jobs=-1\n",
757    ")\n",
758    "\n",
759    "train_mean_val = train_scores_val.mean(axis=1)\n",
760    "train_std_val = train_scores_val.std(axis=1)\n",
761    "test_mean_val = test_scores_val.mean(axis=1)\n",
762    "test_std_val = test_scores_val.std(axis=1)\n",
763    "\n",
764    "# ๊ฒ€์ฆ ๊ณก์„  ์‹œ๊ฐํ™”\n",
765    "plt.figure(figsize=(10, 6))\n",
766    "plt.semilogx(param_range, train_mean_val, 'o-', color='blue', linewidth=2, \n",
767    "             label='Training Score')\n",
768    "plt.semilogx(param_range, test_mean_val, 'o-', color='orange', linewidth=2, \n",
769    "             label='Validation Score')\n",
770    "plt.fill_between(param_range, train_mean_val - train_std_val, \n",
771    "                 train_mean_val + train_std_val, alpha=0.2, color='blue')\n",
772    "plt.fill_between(param_range, test_mean_val - test_std_val, \n",
773    "                 test_mean_val + test_std_val, alpha=0.2, color='orange')\n",
774    "plt.xlabel('C (Regularization Parameter)', fontsize=12)\n",
775    "plt.ylabel('Accuracy', fontsize=12)\n",
776    "plt.title('Validation Curve (SVM RBF)', fontsize=14, pad=20)\n",
777    "plt.legend(loc='best', fontsize=11)\n",
778    "plt.grid(True, alpha=0.3)\n",
779    "plt.show()\n",
780    "\n",
781    "print(\"๊ฒ€์ฆ ๊ณก์„  ํ•ด์„:\")\n",
782    "print(\"  - ์™ผ์ชฝ(์ž‘์€ C): ๊ณผ์†Œ์ ํ•ฉ (์ •๊ทœํ™” ๊ฐ•ํ•จ)\")\n",
783    "print(\"  - ์ค‘๊ฐ„: ์ ์ ˆํ•œ ๋ณต์žก๋„\")\n",
784    "print(\"  - ์˜ค๋ฅธ์ชฝ(ํฐ C): ๊ณผ์ ํ•ฉ ๊ฐ€๋Šฅ์„ฑ (์ •๊ทœํ™” ์•ฝํ•จ)\")"
785   ]
786  },
787  {
788   "cell_type": "markdown",
789   "metadata": {},
790   "source": [
791    "## 8. 커스텀 스코어링 함수"
792   ]
793  },
794  {
795   "cell_type": "code",
796   "execution_count": null,
797   "metadata": {},
798   "outputs": [],
799   "source": [
800    "# ์ปค์Šคํ…€ ์Šค์ฝ”์–ด๋ง ํ•จ์ˆ˜\n",
801    "def custom_f1_score(y_true, y_pred):\n",
802    "    \"\"\"๊ฐ€์ค‘ ํ‰๊ท  F1-score\"\"\"\n",
803    "    return f1_score(y_true, y_pred, average='weighted')\n",
804    "\n",
805    "custom_scorer = make_scorer(custom_f1_score)\n",
806    "\n",
807    "# ์ปค์Šคํ…€ ์Šค์ฝ”์–ด๋Ÿฌ ์‚ฌ์šฉ\n",
808    "scores_custom = cross_val_score(\n",
809    "    LogisticRegression(max_iter=1000, random_state=42), \n",
810    "    X, y, \n",
811    "    cv=5, \n",
812    "    scoring=custom_scorer\n",
813    ")\n",
814    "\n",
815    "print(\"=== ์ปค์Šคํ…€ ์Šค์ฝ”์–ด๋ง ํ•จ์ˆ˜ ===\")\n",
816    "print(f\"์ปค์Šคํ…€ F1-score: {scores_custom.mean():.4f} (+/- {scores_custom.std():.4f})\")\n",
817    "\n",
818    "# ๋‚ด์žฅ ์Šค์ฝ”์–ด๋ง ํ•จ์ˆ˜๋“ค\n",
819    "print(\"\\n๋‚ด์žฅ ์Šค์ฝ”์–ด๋ง ํ•จ์ˆ˜:\")\n",
820    "print(\"  ๋ถ„๋ฅ˜: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'\")\n",
821    "print(\"  ํšŒ๊ท€: 'r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'\")"
822   ]
823  },
824  {
825   "cell_type": "markdown",
826   "metadata": {},
827   "source": [
828    "## 9. 결과 저장 및 로드"
829   ]
830  },
831  {
832   "cell_type": "code",
833   "execution_count": null,
834   "metadata": {},
835   "outputs": [],
836   "source": [
837    "import joblib\n",
838    "import json\n",
839    "\n",
840    "# ์ตœ์  ๋ชจ๋ธ ์ €์žฅ\n",
841    "best_model = grid_search_pipeline.best_estimator_\n",
842    "joblib.dump(best_model, 'best_model.pkl')\n",
843    "print(\"์ตœ์  ๋ชจ๋ธ์ด 'best_model.pkl'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\")\n",
844    "\n",
845    "# ๊ฒฐ๊ณผ ์ €์žฅ\n",
846    "results = {\n",
847    "    'best_params': grid_search_pipeline.best_params_,\n",
848    "    'best_score': grid_search_pipeline.best_score_,\n",
849    "    'cv_results': {\n",
850    "        k: v.tolist() if isinstance(v, np.ndarray) else v\n",
851    "        for k, v in grid_search_pipeline.cv_results_.items()\n",
852    "        if k in ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']\n",
853    "    }\n",
854    "}\n",
855    "\n",
856    "with open('tuning_results.json', 'w') as f:\n",
857    "    json.dump(results, f, indent=2)\n",
858    "print(\"ํŠœ๋‹ ๊ฒฐ๊ณผ๊ฐ€ 'tuning_results.json'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.\")\n",
859    "\n",
860    "# ๋ชจ๋ธ ๋กœ๋“œ\n",
861    "loaded_model = joblib.load('best_model.pkl')\n",
862    "print(f\"\\n๋กœ๋“œ๋œ ๋ชจ๋ธ: {loaded_model}\")"
863   ]
864  },
865  {
866   "cell_type": "markdown",
867   "metadata": {},
868   "source": [
869    "## 요약\n",
870    "\n",
871    "### 교차검증 방법 선택\n",
872    "\n",
873    "| 기법 | 용도 | 특징 |\n",
874    "|------|------|------|\n",
875    "| K-Fold | 일반적인 평가 | 데이터를 K개로 분할 |\n",
876    "| Stratified K-Fold | 불균형 데이터 | 클래스 비율 유지 |\n",
877    "| Time Series Split | 시계열 데이터 | 시간 순서 유지 |\n",
878    "| Leave-One-Out | 작은 데이터셋 | 계산 비용 높음 |\n",
879    "\n",
880    "### 하이퍼파라미터 튜닝 방법\n",
881    "\n",
882    "| 방법 | 장점 | 단점 | 사용 시기 |\n",
883    "|------|------|------|----------|\n",
884    "| Grid Search | 완전 탐색 | 계산 비용 높음 | 파라미터 적고 범위 명확 |\n",
885    "| Randomized Search | 효율적 | 최적해 보장 없음 | 파라미터 많고 범위 불확실 |\n",
886    "| Nested CV | 신뢰성 높음 | 계산 비용 매우 높음 | 연구, 벤치마크 |\n",
887    "\n",
888    "### 실전 팁\n",
889    "\n",
890    "1. **작은 데이터셋**: Stratified K-Fold (k=5 or 10)\n",
891    "2. **큰 데이터셋**: Stratified K-Fold (k=3) 또는 단일 train/test split\n",
892    "3. **시계열**: Time Series Split\n",
893    "4. **파라미터 탐색**: Grid Search (좁은 범위) → Randomized Search (넓은 범위)\n",
894    "5. **파이프라인**: 전처리 포함하여 데이터 누수 방지"
895   ]
896  }
897 ],
898 "metadata": {
899  "kernelspec": {
900   "display_name": "Python 3",
901   "language": "python",
902   "name": "python3"
903  },
904  "language_info": {
905   "codemirror_mode": {
906    "name": "ipython",
907    "version": 3
908   },
909   "file_extension": ".py",
910   "mimetype": "text/x-python",
911   "name": "python",
912   "nbconvert_exporter": "python",
913   "pygments_lexer": "ipython3",
914   "version": "3.8.0"
915  }
916 },
917 "nbformat": 4,
918 "nbformat_minor": 4
919}