{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cross-Validation and Hyperparameter Tuning\n",
    "\n",
    "This notebook covers cross-validation for estimating a model's generalization performance, and methods for finding optimal hyperparameters.\n",
    "\n",
    "## Table of Contents\n",
    "1. Cross-Validation\n",
    "   - K-Fold Cross-Validation\n",
    "   - Stratified K-Fold\n",
    "   - Leave-One-Out CV\n",
    "   - Time Series Split\n",
    "2. Hyperparameter Tuning\n",
    "   - Grid Search\n",
    "   - Randomized Search\n",
    "3. Advanced Techniques\n",
    "   - Nested Cross-Validation\n",
    "   - Learning Curves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.datasets import load_iris, load_breast_cancer, load_diabetes\n",
    "from sklearn.model_selection import (\n",
    "    cross_val_score, cross_validate,\n",
    "    KFold, StratifiedKFold, LeaveOneOut, ShuffleSplit,\n",
    "    TimeSeriesSplit, RepeatedKFold,\n",
    "    GridSearchCV, RandomizedSearchCV,\n",
    "    learning_curve, validation_curve\n",
    ")\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import make_scorer, f1_score, accuracy_score\n",
    "from scipy.stats import uniform, randint\n",
    "\n",
    "# Plotting configuration\n",
    "plt.rcParams['figure.figsize'] = (10, 6)\n",
    "plt.rcParams['font.family'] = 'AppleGothic'  # CJK-capable font for macOS; adjust for your platform\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "# Suppress warnings\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Cross-Validation\n",
    "\n",
    "### 1.1 K-Fold Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data\n",
    "iris = load_iris()\n",
    "X, y = iris.data, iris.target\n",
    "\n",
    "# Create the model\n",
    "model = LogisticRegression(max_iter=1000, random_state=42)\n",
    "\n",
    "# K-fold cross-validation (K=5)\n",
    "scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')\n",
    "\n",
    "print(\"=== K-Fold Cross-Validation (K=5) ===\")\n",
    "print(f\"Per-fold scores: {scores}\")\n",
    "print(f\"Mean accuracy: {scores.mean():.4f}\")\n",
    "print(f\"Standard deviation: {scores.std():.4f}\")\n",
    "print(f\"Approx. 95% interval: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Understanding K-fold via a manual implementation\n",
    "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
    "\n",
    "print(\"\\n=== K-Fold Split Details ===\")\n",
    "fold_scores = []\n",
    "\n",
    "for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):\n",
    "    # Split the data\n",
    "    X_train, X_val = X[train_idx], X[val_idx]\n",
    "    y_train, y_val = y[train_idx], y[val_idx]\n",
    "    \n",
    "    # Train and evaluate the model\n",
    "    model_fold = LogisticRegression(max_iter=1000, random_state=42)\n",
    "    model_fold.fit(X_train, y_train)\n",
    "    score = model_fold.score(X_val, y_val)\n",
    "    fold_scores.append(score)\n",
    "    \n",
    "    print(f\"Fold {fold}: Train={len(train_idx)}, Val={len(val_idx)}, Accuracy={score:.4f}\")\n",
    "\n",
    "print(f\"\\nMean accuracy: {np.mean(fold_scores):.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare performance across K values\n",
    "k_values = [3, 5, 10, 15, 20]\n",
    "mean_scores = []\n",
    "std_scores = []\n",
    "\n",
    "for k in k_values:\n",
    "    scores = cross_val_score(model, X, y, cv=k, scoring='accuracy')\n",
    "    mean_scores.append(scores.mean())\n",
    "    std_scores.append(scores.std())\n",
    "\n",
    "# Visualization\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.errorbar(k_values, mean_scores, yerr=std_scores, marker='o', \n",
    "             capsize=5, capthick=2, linewidth=2)\n",
    "plt.xlabel('K (Number of Folds)', fontsize=12)\n",
    "plt.ylabel('Mean Accuracy', fontsize=12)\n",
    "plt.title('K-Fold CV Performance vs K Value', fontsize=14, pad=20)\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.show()\n",
    "\n",
    "print(\"\\n=== Performance by K ===\")\n",
    "for k, mean, std in zip(k_values, mean_scores, std_scores):\n",
    "    print(f\"K={k:2d}: {mean:.4f} (+/- {std:.4f})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2 Stratified K-Fold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check class proportions\n",
    "unique, counts = np.unique(y, return_counts=True)\n",
    "print(\"Class distribution in the full dataset:\")\n",
    "for cls, cnt in zip(unique, counts):\n",
    "    print(f\"  Class {cls}: {cnt} ({cnt/len(y)*100:.1f}%)\")\n",
    "\n",
    "# Stratified K-fold\n",
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
    "\n",
    "print(\"\\n=== Class Distribution per Stratified Fold ===\")\n",
    "for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):\n",
    "    train_classes = np.bincount(y[train_idx])\n",
    "    val_classes = np.bincount(y[val_idx])\n",
    "    \n",
    "    print(f\"Fold {fold}:\")\n",
    "    print(f\"  Train: {train_classes} ({np.round(train_classes/train_classes.sum()*100, 1)}%)\")\n",
    "    print(f\"  Val:   {val_classes} ({np.round(val_classes/val_classes.sum()*100, 1)}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare K-fold vs stratified K-fold\n",
    "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
    "\n",
    "scores_kf = cross_val_score(model, X, y, cv=kf, scoring='accuracy')\n",
    "scores_skf = cross_val_score(model, X, y, cv=skf, scoring='accuracy')\n",
    "\n",
    "print(\"=== K-Fold vs Stratified K-Fold ===\")\n",
    "print(f\"K-Fold:            {scores_kf.mean():.4f} (+/- {scores_kf.std():.4f})\")\n",
    "print(f\"Stratified K-Fold: {scores_skf.mean():.4f} (+/- {scores_skf.std():.4f})\")\n",
    "print(\"\\nOn imbalanced data, Stratified K-Fold gives more stable estimates.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.3 Other Cross-Validation Strategies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leave-One-Out (LOO)\n",
    "loo = LeaveOneOut()\n",
    "print(f\"Number of Leave-One-Out splits: {loo.get_n_splits(X)} (equals the number of samples)\")\n",
    "print(\"LOO is useful for small datasets but computationally expensive.\\n\")\n",
    "\n",
    "# Shuffle split (random splits)\n",
    "ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)\n",
    "scores_ss = cross_val_score(model, X, y, cv=ss, scoring='accuracy')\n",
    "print(f\"Shuffle Split mean: {scores_ss.mean():.4f} (+/- {scores_ss.std():.4f})\")\n",
    "print(\"Each split is drawn by independent random sampling.\\n\")\n",
    "\n",
    "# Repeated K-fold\n",
    "rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)\n",
    "scores_rkf = cross_val_score(model, X, y, cv=rkf, scoring='accuracy')\n",
    "print(f\"Repeated K-Fold mean: {scores_rkf.mean():.4f} (+/- {scores_rkf.std():.4f})\")\n",
    "print(f\"Total splits: {len(scores_rkf)} (5 folds x 10 repeats = 50)\")\n",
    "print(\"Repetition yields a more stable performance estimate.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.4 Time Series Cross-Validation (Time Series Split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross-validation for time series data\n",
    "tscv = TimeSeriesSplit(n_splits=5)\n",
    "\n",
    "print(\"=== Time Series Split ===\")\n",
    "print(\"Time series data must preserve the past -> future order.\\n\")\n",
    "\n",
    "for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):\n",
    "    print(f\"Fold {fold}:\")\n",
    "    print(f\"  Train: [{train_idx[0]:3d}:{train_idx[-1]:3d}] ({len(train_idx)} samples)\")\n",
    "    print(f\"  Test:  [{test_idx[0]:3d}:{test_idx[-1]:3d}] ({len(test_idx)} samples)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the time series splits\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "\n",
    "for i, (train, test) in enumerate(tscv.split(X)):\n",
    "    # Train set\n",
    "    ax.barh(i, len(train), left=train[0], height=0.4, \n",
    "            align='center', color='blue', alpha=0.6, label='Train' if i == 0 else '')\n",
    "    # Test set\n",
    "    ax.barh(i, len(test), left=test[0], height=0.4, \n",
    "            align='center', color='red', alpha=0.6, label='Test' if i == 0 else '')\n",
    "\n",
    "ax.set_yticks(range(tscv.n_splits))\n",
    "ax.set_yticklabels([f'Fold {i+1}' for i in range(tscv.n_splits)])\n",
    "ax.set_xlabel('Sample Index', fontsize=12)\n",
    "ax.set_title('Time Series Split Visualization', fontsize=14, pad=20)\n",
    "ax.legend(loc='upper left', fontsize=11)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. cross_val_score vs cross_validate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cross_validate: evaluate several metrics at once\n",
    "scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']\n",
    "\n",
    "cv_results = cross_validate(\n",
    "    model, X, y,\n",
    "    cv=5,\n",
    "    scoring=scoring,\n",
    "    return_train_score=True\n",
    ")\n",
    "\n",
    "print(\"=== cross_validate Results ===\")\n",
    "for metric in scoring:\n",
    "    train_key = f'train_{metric}'\n",
    "    test_key = f'test_{metric}'\n",
    "    print(f\"\\n{metric}:\")\n",
    "    print(f\"  Train: {cv_results[train_key].mean():.4f} (+/- {cv_results[train_key].std():.4f})\")\n",
    "    print(f\"  Test:  {cv_results[test_key].mean():.4f} (+/- {cv_results[test_key].std():.4f})\")\n",
    "\n",
    "print(f\"\\nMean fit time: {cv_results['fit_time'].mean():.4f}s\")\n",
    "print(f\"Mean score time: {cv_results['score_time'].mean():.4f}s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the results\n",
    "metrics_df = pd.DataFrame({\n",
    "    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
    "    'Train': [cv_results[f'train_{m}'].mean() for m in scoring],\n",
    "    'Test': [cv_results[f'test_{m}'].mean() for m in scoring]\n",
    "})\n",
    "\n",
    "x = np.arange(len(metrics_df))\n",
    "width = 0.35\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "bars1 = ax.bar(x - width/2, metrics_df['Train'], width, label='Train', alpha=0.8)\n",
    "bars2 = ax.bar(x + width/2, metrics_df['Test'], width, label='Test', alpha=0.8)\n",
    "\n",
    "ax.set_xlabel('Metrics', fontsize=12)\n",
    "ax.set_ylabel('Score', fontsize=12)\n",
    "ax.set_title('Train vs Test Scores (5-Fold CV)', fontsize=14, pad=20)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(metrics_df['Metric'])\n",
    "ax.legend(fontsize=11)\n",
    "ax.set_ylim([0.9, 1.0])\n",
    "ax.grid(True, alpha=0.3, axis='y')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Hyperparameter Tuning\n",
    "\n",
    "### 3.1 Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Breast cancer dataset\n",
    "cancer = load_breast_cancer()\n",
    "X_cancer, y_cancer = cancer.data, cancer.target\n",
    "\n",
    "# Scaling\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X_cancer)\n",
    "\n",
    "# Hyperparameter grid\n",
    "param_grid = {\n",
    "    'C': [0.1, 1, 10, 100],\n",
    "    'gamma': [1, 0.1, 0.01, 0.001],\n",
    "    'kernel': ['rbf', 'linear']\n",
    "}\n",
    "\n",
    "n_combos = len(param_grid['C']) * len(param_grid['gamma']) * len(param_grid['kernel'])\n",
    "print(\"=== Grid Search ===\")\n",
    "print(f\"Number of parameter combinations: {n_combos}\")\n",
    "print(\"CV folds: 5\")\n",
    "print(f\"Total fits: {n_combos * 5}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the grid search\n",
    "grid_search = GridSearchCV(\n",
    "    SVC(random_state=42),\n",
    "    param_grid,\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    verbose=1,\n",
    "    n_jobs=-1  # use all CPU cores\n",
    ")\n",
    "\n",
    "grid_search.fit(X_scaled, y_cancer)\n",
    "\n",
    "print(\"\\n=== Grid Search Results ===\")\n",
    "print(f\"Best parameters: {grid_search.best_params_}\")\n",
    "print(f\"Best score: {grid_search.best_score_:.4f}\")\n",
    "print(f\"Best estimator: {grid_search.best_estimator_}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the full results\n",
    "results_df = pd.DataFrame(grid_search.cv_results_)\n",
    "\n",
    "# Top 10 combinations (rank 1 is best, so take the smallest ranks)\n",
    "top_results = results_df.nsmallest(10, 'rank_test_score')[[\n",
    "    'params', 'mean_test_score', 'std_test_score', 'rank_test_score'\n",
    "]]\n",
    "\n",
    "print(\"\\n=== Top 10 Parameter Combinations ===\")\n",
    "for idx, row in top_results.iterrows():\n",
    "    print(f\"Rank {int(row['rank_test_score'])}: {row['params']}\")\n",
    "    print(f\"  Score: {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heatmap of grid search results (C vs gamma, RBF kernel)\n",
    "rbf_results = results_df[results_df['param_kernel'] == 'rbf']\n",
    "\n",
    "# Build the pivot table\n",
    "pivot_table = rbf_results.pivot_table(\n",
    "    values='mean_test_score',\n",
    "    index='param_gamma',\n",
    "    columns='param_C'\n",
    ")\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(pivot_table, annot=True, fmt='.4f', cmap='YlGnBu', \n",
    "            cbar_kws={'label': 'Accuracy'})\n",
    "plt.title('Grid Search Results (RBF Kernel): C vs Gamma', fontsize=14, pad=20)\n",
    "plt.xlabel('C (Regularization Parameter)', fontsize=12)\n",
    "plt.ylabel('Gamma', fontsize=12)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 Randomized Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define hyperparameter distributions\n",
    "param_distributions = {\n",
    "    'C': uniform(0.1, 100),      # uniform over [0.1, 100.1]\n",
    "    'gamma': uniform(0.001, 1),  # uniform over [0.001, 1.001]\n",
    "    'kernel': ['rbf', 'linear', 'poly']\n",
    "}\n",
    "\n",
    "# Run the randomized search\n",
    "random_search = RandomizedSearchCV(\n",
    "    SVC(random_state=42),\n",
    "    param_distributions,\n",
    "    n_iter=50,  # sample 50 combinations\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    random_state=42,\n",
    "    verbose=1,\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "random_search.fit(X_scaled, y_cancer)\n",
    "\n",
    "print(\"\\n=== Randomized Search Results ===\")\n",
    "print(f\"Best parameters: {random_search.best_params_}\")\n",
    "print(f\"Best score: {random_search.best_score_:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare grid search vs randomized search\n",
    "comparison_df = pd.DataFrame({\n",
    "    'Method': ['Grid Search', 'Randomized Search'],\n",
    "    'Best Score': [grid_search.best_score_, random_search.best_score_],\n",
    "    'N Iterations': [len(grid_search.cv_results_['params']), \n",
    "                     len(random_search.cv_results_['params'])]\n",
    "})\n",
    "\n",
    "print(\"\\n=== Grid Search vs Randomized Search ===\")\n",
    "print(comparison_df.to_string(index=False))\n",
    "print(\"\\nRandomized Search:\")\n",
    "print(\"  - Pros: computationally efficient, can sample continuous distributions\")\n",
    "print(\"  - Cons: no guarantee of finding the optimum\")\n",
    "print(\"\\nGrid Search:\")\n",
    "print(\"  - Pros: exhaustive search; finds the best combination within the grid\")\n",
    "print(\"  - Cons: number of combinations grows exponentially\")"
   ]
  },
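  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (Added sketch, not part of the original notebook.) A third option between the\n",
    "# two searches above is successive halving, which spends little budget on weak\n",
    "# candidates and more on promising ones. HalvingGridSearchCV is experimental in\n",
    "# scikit-learn and must be enabled explicitly; skip this cell if your version\n",
    "# does not provide it.\n",
    "from sklearn.experimental import enable_halving_search_cv  # noqa: F401\n",
    "from sklearn.model_selection import HalvingGridSearchCV\n",
    "\n",
    "halving_search = HalvingGridSearchCV(\n",
    "    SVC(random_state=42),\n",
    "    param_grid,\n",
    "    cv=5,\n",
    "    factor=3,  # keep roughly the best 1/3 of candidates each round\n",
    "    scoring='accuracy',\n",
    "    random_state=42,\n",
    "    n_jobs=-1\n",
    ")\n",
    "halving_search.fit(X_scaled, y_cancer)\n",
    "\n",
    "print(f\"Halving search best parameters: {halving_search.best_params_}\")\n",
    "print(f\"Halving search best score: {halving_search.best_score_:.4f}\")"
   ]
  },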
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 Tuning a Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random forest parameter grid\n",
    "rf_param_grid = {\n",
    "    'n_estimators': [50, 100, 200],\n",
    "    'max_depth': [None, 10, 20, 30],\n",
    "    'min_samples_split': [2, 5, 10],\n",
    "    'min_samples_leaf': [1, 2, 4]\n",
    "}\n",
    "\n",
    "rf_grid_search = GridSearchCV(\n",
    "    RandomForestClassifier(random_state=42),\n",
    "    rf_param_grid,\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    verbose=1,\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "rf_grid_search.fit(X_cancer, y_cancer)\n",
    "\n",
    "print(\"\\n=== Random Forest Grid Search Results ===\")\n",
    "print(f\"Best parameters: {rf_grid_search.best_params_}\")\n",
    "print(f\"Best score: {rf_grid_search.best_score_:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize feature importances\n",
    "best_rf = rf_grid_search.best_estimator_\n",
    "feature_importance = pd.DataFrame({\n",
    "    'feature': cancer.feature_names,\n",
    "    'importance': best_rf.feature_importances_\n",
    "}).sort_values('importance', ascending=False)\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "plt.barh(feature_importance['feature'][:15], feature_importance['importance'][:15])\n",
    "plt.xlabel('Importance', fontsize=12)\n",
    "plt.title('Top 15 Feature Importances (Optimized Random Forest)', fontsize=14, pad=20)\n",
    "plt.gca().invert_yaxis()\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(\"\\nTop 5 Features:\")\n",
    "print(feature_importance.head().to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Nested Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nested CV: outer loop (model evaluation) + inner loop (hyperparameter tuning)\n",
    "\n",
    "# Inner CV (hyperparameter tuning)\n",
    "param_grid_nested = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01]}\n",
    "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)\n",
    "grid_search_nested = GridSearchCV(\n",
    "    SVC(kernel='rbf', random_state=42), \n",
    "    param_grid_nested, \n",
    "    cv=inner_cv, \n",
    "    scoring='accuracy'\n",
    ")\n",
    "\n",
    "# Outer CV (model evaluation)\n",
    "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
    "nested_scores = cross_val_score(\n",
    "    grid_search_nested, \n",
    "    X_scaled, \n",
    "    y_cancer, \n",
    "    cv=outer_cv, \n",
    "    scoring='accuracy'\n",
    ")\n",
    "\n",
    "print(\"=== Nested Cross-Validation Results ===\")\n",
    "print(f\"Outer-fold scores: {nested_scores}\")\n",
    "print(f\"Mean score: {nested_scores.mean():.4f} (+/- {nested_scores.std():.4f})\")\n",
    "\n",
    "# Comparison: plain CV vs nested CV\n",
    "grid_search_nested.fit(X_scaled, y_cancer)\n",
    "print(f\"\\nPlain CV best score: {grid_search_nested.best_score_:.4f}\")\n",
    "print(f\"Nested CV mean score: {nested_scores.mean():.4f}\")\n",
    "print(\"\\nNested CV gives a more realistic estimate of generalization performance.\")\n",
    "print(\"Plain CV scores can be optimistically biased, since the same folds are used for tuning (data leakage).\")"
   ]
  },
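  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (Added sketch, not part of the original notebook.) cross_validate with\n",
    "# return_estimator=True keeps each outer fold's fitted GridSearchCV, so we can\n",
    "# inspect which hyperparameters the inner loop selected on each outer fold.\n",
    "nested_results = cross_validate(\n",
    "    grid_search_nested,\n",
    "    X_scaled, y_cancer,\n",
    "    cv=outer_cv,\n",
    "    scoring='accuracy',\n",
    "    return_estimator=True\n",
    ")\n",
    "\n",
    "print(\"Best inner-loop parameters per outer fold:\")\n",
    "for i, est in enumerate(nested_results['estimator'], 1):\n",
    "    print(f\"  Fold {i}: {est.best_params_} (outer score: {nested_results['test_score'][i-1]:.4f})\")"
   ]
  },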
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Using Pipelines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the pipeline\n",
    "pipeline = Pipeline([\n",
    "    ('scaler', StandardScaler()),\n",
    "    ('svm', SVC(random_state=42))\n",
    "])\n",
    "\n",
    "# Parameter names take the form step__parameter\n",
    "param_grid_pipeline = {\n",
    "    'svm__C': [0.1, 1, 10],\n",
    "    'svm__gamma': [0.1, 0.01, 0.001],\n",
    "    'svm__kernel': ['rbf', 'linear']\n",
    "}\n",
    "\n",
    "grid_search_pipeline = GridSearchCV(\n",
    "    pipeline, \n",
    "    param_grid_pipeline, \n",
    "    cv=5, \n",
    "    scoring='accuracy',\n",
    "    verbose=1,\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "# Fit on unscaled data (the pipeline handles scaling)\n",
    "grid_search_pipeline.fit(X_cancer, y_cancer)\n",
    "\n",
    "print(\"\\n=== Pipeline Grid Search Results ===\")\n",
    "print(f\"Best parameters: {grid_search_pipeline.best_params_}\")\n",
    "print(f\"Best score: {grid_search_pipeline.best_score_:.4f}\")\n",
    "print(\"\\nAdvantages of using a pipeline:\")\n",
    "print(\"  - Preprocessing steps are included automatically\")\n",
    "print(\"  - Prevents data leakage during CV\")\n",
    "print(\"  - Keeps the code concise\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Learning Curves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute the learning curve\n",
    "train_sizes, train_scores, val_scores = learning_curve(\n",
    "    grid_search_pipeline.best_estimator_,\n",
    "    X_cancer, y_cancer,\n",
    "    train_sizes=np.linspace(0.1, 1.0, 10),\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "# Means and standard deviations\n",
    "train_mean = train_scores.mean(axis=1)\n",
    "train_std = train_scores.std(axis=1)\n",
    "val_mean = val_scores.mean(axis=1)\n",
    "val_std = val_scores.std(axis=1)\n",
    "\n",
    "# Plot the learning curve\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, \n",
    "                 alpha=0.2, color='blue')\n",
    "plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, \n",
    "                 alpha=0.2, color='orange')\n",
    "plt.plot(train_sizes, train_mean, 'o-', color='blue', linewidth=2, \n",
    "         label='Training Score')\n",
    "plt.plot(train_sizes, val_mean, 'o-', color='orange', linewidth=2, \n",
    "         label='Validation Score')\n",
    "plt.xlabel('Training Set Size', fontsize=12)\n",
    "plt.ylabel('Accuracy', fontsize=12)\n",
    "plt.title('Learning Curve', fontsize=14, pad=20)\n",
    "plt.legend(loc='best', fontsize=11)\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.show()\n",
    "\n",
    "print(\"Interpreting the learning curve:\")\n",
    "print(\"  - Both curves low -> underfitting\")\n",
    "print(\"  - Training high, validation low -> overfitting\")\n",
    "print(\"  - Curves converging -> a good fit\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Validation Curves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validation curve over the C parameter\n",
    "param_range = np.logspace(-4, 2, 10)\n",
    "\n",
    "train_scores_val, test_scores_val = validation_curve(\n",
    "    SVC(kernel='rbf', gamma=0.01, random_state=42),\n",
    "    X_scaled, y_cancer,\n",
    "    param_name='C',\n",
    "    param_range=param_range,\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "train_mean_val = train_scores_val.mean(axis=1)\n",
    "train_std_val = train_scores_val.std(axis=1)\n",
    "test_mean_val = test_scores_val.mean(axis=1)\n",
    "test_std_val = test_scores_val.std(axis=1)\n",
    "\n",
    "# Plot the validation curve\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.semilogx(param_range, train_mean_val, 'o-', color='blue', linewidth=2, \n",
    "             label='Training Score')\n",
    "plt.semilogx(param_range, test_mean_val, 'o-', color='orange', linewidth=2, \n",
    "             label='Validation Score')\n",
    "plt.fill_between(param_range, train_mean_val - train_std_val, \n",
    "                 train_mean_val + train_std_val, alpha=0.2, color='blue')\n",
    "plt.fill_between(param_range, test_mean_val - test_std_val, \n",
    "                 test_mean_val + test_std_val, alpha=0.2, color='orange')\n",
    "plt.xlabel('C (Regularization Parameter)', fontsize=12)\n",
    "plt.ylabel('Accuracy', fontsize=12)\n",
    "plt.title('Validation Curve (SVM RBF)', fontsize=14, pad=20)\n",
    "plt.legend(loc='best', fontsize=11)\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.show()\n",
    "\n",
    "print(\"Interpreting the validation curve:\")\n",
    "print(\"  - Left (small C): underfitting (strong regularization)\")\n",
    "print(\"  - Middle: appropriate model complexity\")\n",
    "print(\"  - Right (large C): risk of overfitting (weak regularization)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Custom Scoring Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Custom scoring function\n",
    "def custom_f1_score(y_true, y_pred):\n",
    "    \"\"\"Weighted-average F1 score.\"\"\"\n",
    "    return f1_score(y_true, y_pred, average='weighted')\n",
    "\n",
    "custom_scorer = make_scorer(custom_f1_score)\n",
    "\n",
    "# Use the custom scorer\n",
    "scores_custom = cross_val_score(\n",
    "    LogisticRegression(max_iter=1000, random_state=42), \n",
    "    X, y, \n",
    "    cv=5, \n",
    "    scoring=custom_scorer\n",
    ")\n",
    "\n",
    "print(\"=== Custom Scoring Function ===\")\n",
    "print(f\"Custom F1 score: {scores_custom.mean():.4f} (+/- {scores_custom.std():.4f})\")\n",
    "\n",
    "# Built-in scoring strings\n",
    "print(\"\\nBuilt-in scoring strings:\")\n",
    "print(\"  Classification: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'\")\n",
    "print(\"  Regression: 'r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'\")"
   ]
  },
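  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (Added sketch, not part of the original notebook.) The full list of built-in\n",
    "# scoring strings can be queried programmatically. get_scorer_names() exists in\n",
    "# scikit-learn >= 1.0; older versions expose a SCORERS dict instead.\n",
    "from sklearn.metrics import get_scorer_names\n",
    "\n",
    "names = sorted(get_scorer_names())\n",
    "print(f\"{len(names)} built-in scorers, for example:\")\n",
    "print([n for n in names if 'f1' in n or 'accuracy' in n])"
   ]
  },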
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Saving and Loading Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import json\n",
    "\n",
    "# Save the best model\n",
    "best_model = grid_search_pipeline.best_estimator_\n",
    "joblib.dump(best_model, 'best_model.pkl')\n",
    "print(\"Best model saved to 'best_model.pkl'.\")\n",
    "\n",
    "# Save the tuning results\n",
    "results = {\n",
    "    'best_params': grid_search_pipeline.best_params_,\n",
    "    'best_score': grid_search_pipeline.best_score_,\n",
    "    'cv_results': {\n",
    "        k: v.tolist() if isinstance(v, np.ndarray) else v\n",
    "        for k, v in grid_search_pipeline.cv_results_.items()\n",
    "        if k in ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']\n",
    "    }\n",
    "}\n",
    "\n",
    "with open('tuning_results.json', 'w') as f:\n",
    "    json.dump(results, f, indent=2)\n",
    "print(\"Tuning results saved to 'tuning_results.json'.\")\n",
    "\n",
    "# Load the model back\n",
    "loaded_model = joblib.load('best_model.pkl')\n",
    "print(f\"\\nLoaded model: {loaded_model}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary\n",
    "\n",
    "### Choosing a Cross-Validation Strategy\n",
    "\n",
    "| Technique | Use case | Notes |\n",
    "|------|------|------|\n",
    "| K-Fold | General-purpose evaluation | Splits the data into K folds |\n",
    "| Stratified K-Fold | Imbalanced data | Preserves class proportions |\n",
    "| Time Series Split | Time series data | Preserves temporal order |\n",
    "| Leave-One-Out | Small datasets | High computational cost |\n",
    "\n",
    "### Hyperparameter Tuning Methods\n",
    "\n",
    "| Method | Pros | Cons | When to use |\n",
    "|------|------|------|----------|\n",
    "| Grid Search | Exhaustive search | High computational cost | Few parameters with clear ranges |\n",
    "| Randomized Search | Efficient | No optimality guarantee | Many parameters or uncertain ranges |\n",
    "| Nested CV | High reliability | Very high computational cost | Research, benchmarking |\n",
    "\n",
    "### Practical Tips\n",
    "\n",
    "1. **Small datasets**: Stratified K-Fold (k=5 or 10)\n",
    "2. **Large datasets**: Stratified K-Fold (k=3) or a single train/test split\n",
    "3. **Time series**: Time Series Split\n",
    "4. **Parameter search**: Randomized Search over a wide range, then Grid Search over a narrow range\n",
    "5. **Pipelines**: include preprocessing to prevent data leakage"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}