05_preprocessing.ipynb

  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "id": "cell-0",
  6   "metadata": {},
  7   "source": [
  8    "# 05. 데이터 전처리 (Data Preprocessing)\n",
  9    "\n",
 10    "## 학습 목표\n",
 11    "- 결측치 처리 전략 이해\n",
 12    "- 특성 스케일링 방법 비교\n",
 13    "- 범주형 변수 인코딩\n",
 14    "- 불균형 데이터 처리"
 15   ]
 16  },
 17  {
 18   "cell_type": "code",
 19   "execution_count": null,
 20   "id": "cell-1",
 21   "metadata": {},
 22   "outputs": [],
 23   "source": [
 24    "import numpy as np\n",
 25    "import pandas as pd\n",
 26    "import matplotlib.pyplot as plt\n",
 27    "import seaborn as sns\n",
 28    "from sklearn.model_selection import train_test_split\n",
 29    "from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler\n",
 30    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder\n",
 31    "from sklearn.impute import SimpleImputer, KNNImputer\n",
 32    "from sklearn.datasets import load_iris, load_wine\n",
 33    "\n",
 34    "plt.rcParams['font.family'] = 'DejaVu Sans'\n",
 35    "plt.rcParams['axes.unicode_minus'] = False"
 36   ]
 37  },
 38  {
 39   "cell_type": "markdown",
 40   "id": "cell-2",
 41   "metadata": {},
 42   "source": [
 43    "## 1. 결측치 처리 (Handling Missing Values)"
 44   ]
 45  },
 46  {
 47   "cell_type": "code",
 48   "execution_count": null,
 49   "id": "cell-3",
 50   "metadata": {},
 51   "outputs": [],
 52   "source": [
 53    "# 결측치가 있는 샘플 데이터 생성\n",
 54    "np.random.seed(42)\n",
 55    "data = {\n",
 56    "    'age': [25, 30, np.nan, 40, 35, np.nan, 50, 28],\n",
 57    "    'income': [50000, np.nan, 60000, 80000, np.nan, 70000, 90000, 55000],\n",
 58    "    'score': [85, 90, 75, np.nan, 88, 92, np.nan, 78]\n",
 59    "}\n",
 60    "df = pd.DataFrame(data)\n",
 61    "\n",
 62    "print(\"원본 데이터:\")\n",
 63    "print(df)\n",
 64    "print(f\"\\n결측치 개수:\\n{df.isnull().sum()}\")\n",
 65    "print(f\"\\n결측치 비율:\\n{df.isnull().mean() * 100:.2f}%\")"
 66   ]
 67  },
 68  {
 69   "cell_type": "markdown",
 70   "id": "cell-4",
 71   "metadata": {},
 72   "source": [
 73    "### 1.1 SimpleImputer - 기본 대체 전략"
 74   ]
 75  },
 76  {
 77   "cell_type": "code",
 78   "execution_count": null,
 79   "id": "cell-5",
 80   "metadata": {},
 81   "outputs": [],
 82   "source": [
 83    "# 평균값으로 대체\n",
 84    "imputer_mean = SimpleImputer(strategy='mean')\n",
 85    "df_mean = pd.DataFrame(\n",
 86    "    imputer_mean.fit_transform(df),\n",
 87    "    columns=df.columns\n",
 88    ")\n",
 89    "\n",
 90    "# 중앙값으로 대체\n",
 91    "imputer_median = SimpleImputer(strategy='median')\n",
 92    "df_median = pd.DataFrame(\n",
 93    "    imputer_median.fit_transform(df),\n",
 94    "    columns=df.columns\n",
 95    ")\n",
 96    "\n",
 97    "# 최빈값으로 대체\n",
 98    "imputer_frequent = SimpleImputer(strategy='most_frequent')\n",
 99    "df_frequent = pd.DataFrame(\n",
100    "    imputer_frequent.fit_transform(df),\n",
101    "    columns=df.columns\n",
102    ")\n",
103    "\n",
104    "# 상수값으로 대체\n",
105    "imputer_constant = SimpleImputer(strategy='constant', fill_value=0)\n",
106    "df_constant = pd.DataFrame(\n",
107    "    imputer_constant.fit_transform(df),\n",
108    "    columns=df.columns\n",
109    ")\n",
110    "\n",
111    "print(\"평균값 대체:\")\n",
112    "print(df_mean)\n",
113    "print(f\"\\n중앙값 대체 (age 컬럼): {df_median['age'].values}\")\n",
114    "print(f\"최빈값 대체 (age 컬럼): {df_frequent['age'].values}\")"
115   ]
116  },
117  {
118   "cell_type": "markdown",
119   "id": "cell-6",
120   "metadata": {},
121   "source": [
122    "### 1.2 KNNImputer - K-최근접 이웃 대체"
123   ]
124  },
125  {
126   "cell_type": "code",
127   "execution_count": null,
128   "id": "cell-7",
129   "metadata": {},
130   "outputs": [],
131   "source": [
132    "# KNN 기반 결측치 대체\n",
133    "imputer_knn = KNNImputer(n_neighbors=3)\n",
134    "df_knn = pd.DataFrame(\n",
135    "    imputer_knn.fit_transform(df),\n",
136    "    columns=df.columns\n",
137    ")\n",
138    "\n",
139    "print(\"KNN 대체:\")\n",
140    "print(df_knn)\n",
141    "\n",
142    "# 시각화 비교\n",
143    "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
144    "\n",
145    "for ax, (method, df_filled) in zip(axes, [\n",
146    "    ('Mean', df_mean), \n",
147    "    ('Median', df_median), \n",
148    "    ('KNN', df_knn)\n",
149    "]):\n",
150    "    ax.scatter(df_filled['age'], df_filled['income'], alpha=0.7, s=100)\n",
151    "    ax.set_xlabel('Age')\n",
152    "    ax.set_ylabel('Income')\n",
153    "    ax.set_title(f'{method} Imputation')\n",
154    "    ax.grid(True, alpha=0.3)\n",
155    "\n",
156    "plt.tight_layout()\n",
157    "plt.show()"
158   ]
159  },
160  {
161   "cell_type": "markdown",
162   "id": "cell-8",
163   "metadata": {},
164   "source": [
165    "## 2. 특성 스케일링 (Feature Scaling)"
166   ]
167  },
168  {
169   "cell_type": "code",
170   "execution_count": null,
171   "id": "cell-9",
172   "metadata": {},
173   "outputs": [],
174   "source": [
175    "# 스케일이 다른 데이터 생성\n",
176    "np.random.seed(42)\n",
177    "data_scale = {\n",
178    "    'age': np.random.randint(20, 60, 100),\n",
179    "    'income': np.random.randint(30000, 150000, 100),\n",
180    "    'score': np.random.uniform(0, 100, 100)\n",
181    "}\n",
182    "df_scale = pd.DataFrame(data_scale)\n",
183    "\n",
184    "print(\"원본 데이터 통계:\")\n",
185    "print(df_scale.describe())"
186   ]
187  },
188  {
189   "cell_type": "markdown",
190   "id": "cell-10",
191   "metadata": {},
192   "source": [
193    "### 2.1 StandardScaler (표준화)"
194   ]
195  },
196  {
197   "cell_type": "code",
198   "execution_count": null,
199   "id": "cell-11",
200   "metadata": {},
201   "outputs": [],
202   "source": [
203    "# StandardScaler: (x - mean) / std\n",
204    "scaler_standard = StandardScaler()\n",
205    "df_standard = pd.DataFrame(\n",
206    "    scaler_standard.fit_transform(df_scale),\n",
207    "    columns=df_scale.columns\n",
208    ")\n",
209    "\n",
210    "print(\"StandardScaler 결과:\")\n",
211    "print(df_standard.describe())\n",
212    "print(f\"\\n평균: {df_standard.mean().values}\")\n",
213    "print(f\"표준편차: {df_standard.std().values}\")"
214   ]
215  },
216  {
217   "cell_type": "markdown",
218   "id": "cell-12",
219   "metadata": {},
220   "source": [
221    "### 2.2 MinMaxScaler (정규화)"
222   ]
223  },
224  {
225   "cell_type": "code",
226   "execution_count": null,
227   "id": "cell-13",
228   "metadata": {},
229   "outputs": [],
230   "source": [
231    "# MinMaxScaler: (x - min) / (max - min)\n",
232    "scaler_minmax = MinMaxScaler(feature_range=(0, 1))\n",
233    "df_minmax = pd.DataFrame(\n",
234    "    scaler_minmax.fit_transform(df_scale),\n",
235    "    columns=df_scale.columns\n",
236    ")\n",
237    "\n",
238    "print(\"MinMaxScaler 결과:\")\n",
239    "print(df_minmax.describe())\n",
240    "print(f\"\\n최솟값: {df_minmax.min().values}\")\n",
241    "print(f\"최댓값: {df_minmax.max().values}\")"
242   ]
243  },
244  {
245   "cell_type": "markdown",
246   "id": "cell-14",
247   "metadata": {},
248   "source": [
249    "### 2.3 RobustScaler (이상치에 강건)"
250   ]
251  },
252  {
253   "cell_type": "code",
254   "execution_count": null,
255   "id": "cell-15",
256   "metadata": {},
257   "outputs": [],
258   "source": [
259    "# RobustScaler: (x - median) / IQR\n",
260    "scaler_robust = RobustScaler()\n",
261    "df_robust = pd.DataFrame(\n",
262    "    scaler_robust.fit_transform(df_scale),\n",
263    "    columns=df_scale.columns\n",
264    ")\n",
265    "\n",
266    "print(\"RobustScaler 결과:\")\n",
267    "print(df_robust.describe())"
268   ]
269  },
270  {
271   "cell_type": "markdown",
272   "id": "cell-16",
273   "metadata": {},
274   "source": [
275    "### 2.4 스케일러 비교 시각화"
276   ]
277  },
278  {
279   "cell_type": "code",
280   "execution_count": null,
281   "id": "cell-17",
282   "metadata": {},
283   "outputs": [],
284   "source": [
285    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
286    "axes = axes.flatten()\n",
287    "\n",
288    "# 이상치 추가\n",
289    "df_outlier = df_scale.copy()\n",
290    "df_outlier.loc[0, 'income'] = 500000  # 이상치 추가\n",
291    "\n",
292    "scalers = [\n",
293    "    ('Original', df_outlier),\n",
294    "    ('StandardScaler', pd.DataFrame(StandardScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
295    "    ('MinMaxScaler', pd.DataFrame(MinMaxScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
296    "    ('RobustScaler', pd.DataFrame(RobustScaler().fit_transform(df_outlier), columns=df_outlier.columns))\n",
297    "]\n",
298    "\n",
299    "for ax, (name, data) in zip(axes, scalers):\n",
300    "    ax.boxplot([data['age'], data['income'], data['score']], labels=['age', 'income', 'score'])\n",
301    "    ax.set_title(name)\n",
302    "    ax.set_ylabel('Value')\n",
303    "    ax.grid(True, alpha=0.3)\n",
304    "\n",
305    "plt.tight_layout()\n",
306    "plt.show()"
307   ]
308  },
309  {
310   "cell_type": "markdown",
311   "id": "cell-18",
312   "metadata": {},
313   "source": [
314    "## 3. 범주형 변수 인코딩 (Categorical Encoding)"
315   ]
316  },
317  {
318   "cell_type": "code",
319   "execution_count": null,
320   "id": "cell-19",
321   "metadata": {},
322   "outputs": [],
323   "source": [
324    "# 범주형 데이터 샘플\n",
325    "data_cat = {\n",
326    "    'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red'],\n",
327    "    'size': ['S', 'M', 'L', 'M', 'S', 'L', 'M'],\n",
328    "    'quality': ['good', 'excellent', 'poor', 'good', 'excellent', 'poor', 'good']\n",
329    "}\n",
330    "df_cat = pd.DataFrame(data_cat)\n",
331    "\n",
332    "print(\"범주형 데이터:\")\n",
333    "print(df_cat)"
334   ]
335  },
336  {
337   "cell_type": "markdown",
338   "id": "cell-20",
339   "metadata": {},
340   "source": [
341    "### 3.1 LabelEncoder (레이블 인코딩)"
342   ]
343  },
344  {
345   "cell_type": "code",
346   "execution_count": null,
347   "id": "cell-21",
348   "metadata": {},
349   "outputs": [],
350   "source": [
351    "# LabelEncoder: 범주를 정수로 변환\n",
352    "le_color = LabelEncoder()\n",
353    "df_cat['color_encoded'] = le_color.fit_transform(df_cat['color'])\n",
354    "\n",
355    "print(\"LabelEncoder 결과:\")\n",
356    "print(df_cat[['color', 'color_encoded']])\n",
357    "print(f\"\\n클래스: {le_color.classes_}\")\n",
358    "print(f\"변환: {dict(zip(le_color.classes_, le_color.transform(le_color.classes_)))}\")"
359   ]
360  },
361  {
362   "cell_type": "markdown",
363   "id": "cell-22",
364   "metadata": {},
365   "source": [
366    "### 3.2 OneHotEncoder (원-핫 인코딩)"
367   ]
368  },
369  {
370   "cell_type": "code",
371   "execution_count": null,
372   "id": "cell-23",
373   "metadata": {},
374   "outputs": [],
375   "source": [
376    "# OneHotEncoder: 범주를 이진 벡터로 변환\n",
377    "ohe = OneHotEncoder(sparse_output=False)\n",
378    "color_onehot = ohe.fit_transform(df_cat[['color']])\n",
379    "\n",
380    "# DataFrame으로 변환\n",
381    "df_onehot = pd.DataFrame(\n",
382    "    color_onehot,\n",
383    "    columns=ohe.get_feature_names_out(['color'])\n",
384    ")\n",
385    "\n",
386    "print(\"OneHotEncoder 결과:\")\n",
387    "print(pd.concat([df_cat['color'], df_onehot], axis=1))"
388   ]
389  },
390  {
391   "cell_type": "markdown",
392   "id": "cell-24",
393   "metadata": {},
394   "source": [
395    "### 3.3 OrdinalEncoder (순서형 인코딩)"
396   ]
397  },
398  {
399   "cell_type": "code",
400   "execution_count": null,
401   "id": "cell-25",
402   "metadata": {},
403   "outputs": [],
404   "source": [
405    "# OrdinalEncoder: 순서가 있는 범주형 변수\n",
406    "oe = OrdinalEncoder(categories=[['poor', 'good', 'excellent']])\n",
407    "df_cat['quality_encoded'] = oe.fit_transform(df_cat[['quality']])\n",
408    "\n",
409    "print(\"OrdinalEncoder 결과:\")\n",
410    "print(df_cat[['quality', 'quality_encoded']])\n",
411    "print(f\"\\n순서: poor(0) < good(1) < excellent(2)\")"
412   ]
413  },
414  {
415   "cell_type": "markdown",
416   "id": "cell-26",
417   "metadata": {},
418   "source": [
419    "### 3.4 Pandas get_dummies"
420   ]
421  },
422  {
423   "cell_type": "code",
424   "execution_count": null,
425   "id": "cell-27",
426   "metadata": {},
427   "outputs": [],
428   "source": [
429    "# pandas의 get_dummies (간편한 원-핫 인코딩)\n",
430    "df_dummies = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'])\n",
431    "\n",
432    "print(\"pd.get_dummies 결과:\")\n",
433    "print(df_dummies.head())\n",
434    "\n",
435    "# drop_first=True로 다중공선성 방지\n",
436    "df_dummies_drop = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'], drop_first=True)\n",
437    "print(f\"\\ndrop_first=True (shape: {df_dummies_drop.shape}):\")\n",
438    "print(df_dummies_drop.head())"
439   ]
440  },
441  {
442   "cell_type": "markdown",
443   "id": "cell-28",
444   "metadata": {},
445   "source": [
446    "## 4. 특성 선택 (Feature Selection)"
447   ]
448  },
449  {
450   "cell_type": "code",
451   "execution_count": null,
452   "id": "cell-29",
453   "metadata": {},
454   "outputs": [],
455   "source": [
456    "from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif\n",
457    "from sklearn.feature_selection import RFE\n",
458    "from sklearn.ensemble import RandomForestClassifier\n",
459    "\n",
460    "# Iris 데이터 로드\n",
461    "iris = load_iris()\n",
462    "X, y = iris.data, iris.target\n",
463    "\n",
464    "print(f\"원본 데이터: {X.shape}\")\n",
465    "print(f\"특성 이름: {iris.feature_names}\")"
466   ]
467  },
468  {
469   "cell_type": "markdown",
470   "id": "cell-30",
471   "metadata": {},
472   "source": [
473    "### 4.1 SelectKBest (통계적 선택)"
474   ]
475  },
476  {
477   "cell_type": "code",
478   "execution_count": null,
479   "id": "cell-31",
480   "metadata": {},
481   "outputs": [],
482   "source": [
483    "# F-통계량 기반 선택\n",
484    "selector_f = SelectKBest(score_func=f_classif, k=2)\n",
485    "X_kbest_f = selector_f.fit_transform(X, y)\n",
486    "\n",
487    "# 상호정보량 기반 선택\n",
488    "selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)\n",
489    "X_kbest_mi = selector_mi.fit_transform(X, y)\n",
490    "\n",
491    "print(\"SelectKBest (F-statistic):\")\n",
492    "scores_f = pd.DataFrame({\n",
493    "    'Feature': iris.feature_names,\n",
494    "    'Score': selector_f.scores_\n",
495    "}).sort_values('Score', ascending=False)\n",
496    "print(scores_f)\n",
497    "\n",
498    "print(\"\\nSelectKBest (Mutual Information):\")\n",
499    "scores_mi = pd.DataFrame({\n",
500    "    'Feature': iris.feature_names,\n",
501    "    'Score': selector_mi.scores_\n",
502    "}).sort_values('Score', ascending=False)\n",
503    "print(scores_mi)"
504   ]
505  },
506  {
507   "cell_type": "markdown",
508   "id": "cell-32",
509   "metadata": {},
510   "source": [
511    "### 4.2 RFE (재귀적 특성 제거)"
512   ]
513  },
514  {
515   "cell_type": "code",
516   "execution_count": null,
517   "id": "cell-33",
518   "metadata": {},
519   "outputs": [],
520   "source": [
521    "# RFE with Random Forest\n",
522    "estimator = RandomForestClassifier(n_estimators=50, random_state=42)\n",
523    "selector_rfe = RFE(estimator, n_features_to_select=2, step=1)\n",
524    "X_rfe = selector_rfe.fit_transform(X, y)\n",
525    "\n",
526    "print(\"RFE 결과:\")\n",
527    "rfe_result = pd.DataFrame({\n",
528    "    'Feature': iris.feature_names,\n",
529    "    'Selected': selector_rfe.support_,\n",
530    "    'Ranking': selector_rfe.ranking_\n",
531    "}).sort_values('Ranking')\n",
532    "print(rfe_result)"
533   ]
534  },
535  {
536   "cell_type": "markdown",
537   "id": "cell-34",
538   "metadata": {},
539   "source": [
540    "### 4.3 특성 중요도 (랜덤 포레스트)"
541   ]
542  },
543  {
544   "cell_type": "code",
545   "execution_count": null,
546   "id": "cell-35",
547   "metadata": {},
548   "outputs": [],
549   "source": [
550    "# 랜덤 포레스트 특성 중요도\n",
551    "rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
552    "rf.fit(X, y)\n",
553    "\n",
554    "importance = pd.DataFrame({\n",
555    "    'Feature': iris.feature_names,\n",
556    "    'Importance': rf.feature_importances_\n",
557    "}).sort_values('Importance', ascending=True)\n",
558    "\n",
559    "plt.figure(figsize=(10, 6))\n",
560    "plt.barh(importance['Feature'], importance['Importance'])\n",
561    "plt.xlabel('Importance')\n",
562    "plt.title('Random Forest Feature Importance - Iris Dataset')\n",
563    "plt.grid(True, alpha=0.3)\n",
564    "plt.tight_layout()\n",
565    "plt.show()"
566   ]
567  },
568  {
569   "cell_type": "markdown",
570   "id": "cell-36",
571   "metadata": {},
572   "source": [
573    "## 5. 불균형 데이터 처리 (Imbalanced Data)"
574   ]
575  },
576  {
577   "cell_type": "code",
578   "execution_count": null,
579   "id": "cell-37",
580   "metadata": {},
581   "outputs": [],
582   "source": [
583    "from sklearn.datasets import make_classification\n",
584    "\n",
585    "# 불균형 데이터 생성 (9:1 비율)\n",
586    "X_imb, y_imb = make_classification(\n",
587    "    n_samples=1000,\n",
588    "    n_features=20,\n",
589    "    n_informative=15,\n",
590    "    n_redundant=5,\n",
591    "    n_classes=2,\n",
592    "    weights=[0.9, 0.1],  # 90% vs 10%\n",
593    "    random_state=42\n",
594    ")\n",
595    "\n",
596    "# 클래스 분포 확인\n",
597    "unique, counts = np.unique(y_imb, return_counts=True)\n",
598    "print(\"클래스 분포:\")\n",
599    "for cls, cnt in zip(unique, counts):\n",
600    "    print(f\"  Class {cls}: {cnt} ({cnt/len(y_imb)*100:.1f}%)\")\n",
601    "\n",
602    "# 시각화\n",
603    "plt.figure(figsize=(8, 5))\n",
604    "plt.bar(['Class 0', 'Class 1'], counts, color=['skyblue', 'salmon'])\n",
605    "plt.ylabel('Count')\n",
606    "plt.title('Imbalanced Dataset Distribution')\n",
607    "plt.grid(True, alpha=0.3)\n",
608    "plt.show()"
609   ]
610  },
611  {
612   "cell_type": "markdown",
613   "id": "cell-38",
614   "metadata": {},
615   "source": [
616    "### 5.1 SMOTE 개념 (이론)"
617   ]
618  },
619  {
620   "cell_type": "code",
621   "execution_count": null,
622   "id": "cell-39",
623   "metadata": {},
624   "outputs": [],
625   "source": [
626    "# SMOTE (Synthetic Minority Over-sampling Technique) 개념 설명\n",
627    "print(\"\"\"\n",
628    "SMOTE 작동 원리:\n",
629    "\n",
630    "1. 소수 클래스의 각 샘플에 대해:\n",
631    "   - K개의 최근접 이웃을 찾음 (보통 k=5)\n",
632    "   \n",
633    "2. 랜덤하게 선택된 이웃과의 선형 보간:\n",
634    "   - new_sample = sample + λ × (neighbor - sample)\n",
635    "   - λ는 0과 1 사이의 랜덤값\n",
636    "   \n",
637    "3. 합성 샘플을 생성하여 소수 클래스 증강\n",
638    "\n",
639    "장점:\n",
640    "- 과적합 위험이 낮음 (단순 복제가 아님)\n",
641    "- 결정 경계가 더 일반화됨\n",
642    "\n",
643    "단점:\n",
644    "- 노이즈에 민감할 수 있음\n",
645    "- 고차원 데이터에서는 효과가 제한적\n",
646    "\n",
647    "사용 방법:\n",
648    "- pip install imbalanced-learn\n",
649    "- from imblearn.over_sampling import SMOTE\n",
650    "- smote = SMOTE(random_state=42)\n",
651    "- X_resampled, y_resampled = smote.fit_resample(X, y)\n",
652    "\"\"\")"
653   ]
654  },
655  {
656   "cell_type": "markdown",
657   "id": "cell-40",
658   "metadata": {},
659   "source": [
660    "### 5.2 클래스 가중치 조정"
661   ]
662  },
663  {
664   "cell_type": "code",
665   "execution_count": null,
666   "id": "cell-41",
667   "metadata": {},
668   "outputs": [],
669   "source": [
670    "from sklearn.linear_model import LogisticRegression\n",
671    "from sklearn.metrics import classification_report, confusion_matrix\n",
672    "\n",
673    "X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(\n",
674    "    X_imb, y_imb, test_size=0.3, random_state=42\n",
675    ")\n",
676    "\n",
677    "# 가중치 없음\n",
678    "clf_no_weight = LogisticRegression(random_state=42, max_iter=1000)\n",
679    "clf_no_weight.fit(X_train_imb, y_train_imb)\n",
680    "y_pred_no_weight = clf_no_weight.predict(X_test_imb)\n",
681    "\n",
682    "# 가중치 조정 (balanced)\n",
683    "clf_balanced = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)\n",
684    "clf_balanced.fit(X_train_imb, y_train_imb)\n",
685    "y_pred_balanced = clf_balanced.predict(X_test_imb)\n",
686    "\n",
687    "print(\"=== 가중치 없음 ===\")\n",
688    "print(classification_report(y_test_imb, y_pred_no_weight))\n",
689    "\n",
690    "print(\"\\n=== 가중치 조정 (balanced) ===\")\n",
691    "print(classification_report(y_test_imb, y_pred_balanced))"
692   ]
693  },
694  {
695   "cell_type": "markdown",
696   "id": "cell-42",
697   "metadata": {},
698   "source": [
699    "## 6. 실전 전처리 파이프라인"
700   ]
701  },
702  {
703   "cell_type": "code",
704   "execution_count": null,
705   "id": "cell-43",
706   "metadata": {},
707   "outputs": [],
708   "source": [
709    "from sklearn.pipeline import Pipeline\n",
710    "from sklearn.compose import ColumnTransformer\n",
711    "\n",
712    "# 혼합 데이터 생성\n",
713    "data_mixed = {\n",
714    "    'age': [25, np.nan, 35, 40, 30, 45, np.nan, 28],\n",
715    "    'income': [50000, 60000, np.nan, 80000, 70000, 90000, 55000, np.nan],\n",
716    "    'city': ['Seoul', 'Busan', 'Seoul', 'Daegu', 'Busan', 'Seoul', 'Daegu', 'Busan'],\n",
717    "    'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],\n",
718    "    'purchased': [0, 1, 1, 0, 1, 1, 0, 1]\n",
719    "}\n",
720    "df_mixed = pd.DataFrame(data_mixed)\n",
721    "\n",
722    "X_mixed = df_mixed.drop('purchased', axis=1)\n",
723    "y_mixed = df_mixed['purchased']\n",
724    "\n",
725    "print(\"혼합 데이터:\")\n",
726    "print(df_mixed)"
727   ]
728  },
729  {
730   "cell_type": "code",
731   "execution_count": null,
732   "id": "cell-44",
733   "metadata": {},
734   "outputs": [],
735   "source": [
736    "# 수치형/범주형 특성 분리\n",
737    "numeric_features = ['age', 'income']\n",
738    "categorical_features = ['city', 'education']\n",
739    "\n",
740    "# 수치형 전처리 파이프라인\n",
741    "numeric_transformer = Pipeline(steps=[\n",
742    "    ('imputer', SimpleImputer(strategy='median')),\n",
743    "    ('scaler', StandardScaler())\n",
744    "])\n",
745    "\n",
746    "# 범주형 전처리 파이프라인\n",
747    "categorical_transformer = Pipeline(steps=[\n",
748    "    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
749    "    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))\n",
750    "])\n",
751    "\n",
752    "# ColumnTransformer로 결합\n",
753    "preprocessor = ColumnTransformer(\n",
754    "    transformers=[\n",
755    "        ('num', numeric_transformer, numeric_features),\n",
756    "        ('cat', categorical_transformer, categorical_features)\n",
757    "    ]\n",
758    ")\n",
759    "\n",
760    "# 전체 파이프라인\n",
761    "pipeline = Pipeline(steps=[\n",
762    "    ('preprocessor', preprocessor),\n",
763    "    ('classifier', LogisticRegression(random_state=42))\n",
764    "])\n",
765    "\n",
766    "# 학습 (작은 데이터이므로 전체 사용)\n",
767    "pipeline.fit(X_mixed, y_mixed)\n",
768    "\n",
769    "# 새로운 데이터 예측\n",
770    "new_data = pd.DataFrame({\n",
771    "    'age': [30],\n",
772    "    'income': [70000],\n",
773    "    'city': ['Seoul'],\n",
774    "    'education': ['Master']\n",
775    "})\n",
776    "\n",
777    "prediction = pipeline.predict(new_data)\n",
778    "probability = pipeline.predict_proba(new_data)\n",
779    "\n",
780    "print(f\"\\n예측 결과: {prediction[0]}\")\n",
781    "print(f\"확률: {probability[0]}\")"
782   ]
783  },
784  {
785   "cell_type": "markdown",
786   "id": "cell-45",
787   "metadata": {},
788   "source": [
789    "## 정리\n",
790    "\n",
791    "### 핵심 개념\n",
792    "\n",
793    "**결측치 처리:**\n",
794    "- **SimpleImputer**: 평균, 중앙값, 최빈값, 상수로 대체\n",
795    "- **KNNImputer**: K-최근접 이웃 기반 대체\n",
796    "\n",
797    "**특성 스케일링:**\n",
798    "- **StandardScaler**: 평균 0, 표준편차 1로 변환 (정규분포에 가까운 데이터에 적합, 이상치에 민감)\n",
799    "- **MinMaxScaler**: 0-1 범위로 정규화\n",
800    "- **RobustScaler**: 중앙값과 IQR 사용 (이상치에 강건)\n",
801    "\n",
802    "**범주형 인코딩:**\n",
803    "- **LabelEncoder**: 범주를 정수 레이블로 변환 (타겟 변수용)\n",
804    "- **OneHotEncoder**: 이진 벡터로 변환 (다중공선성 주의)\n",
805    "- **OrdinalEncoder**: 순서가 있는 범주형\n",
806    "\n",
807    "**불균형 데이터:**\n",
808    "- **SMOTE**: 합성 샘플 생성 (요구사항: imbalanced-learn)\n",
809    "- **class_weight**: 모델 가중치 조정\n",
810    "\n",
811    "### 다음 단계\n",
812    "- Pipeline과 ColumnTransformer 활용\n",
813    "- 교차 검증과 전처리 통합\n",
814    "- 실전 프로젝트 적용"
815   ]
816  }
817 ],
818 "metadata": {
819  "kernelspec": {
820   "display_name": "Python 3",
821   "language": "python",
822   "name": "python3"
823  },
824  "language_info": {
825   "codemirror_mode": {
826    "name": "ipython",
827    "version": 3
828   },
829   "file_extension": ".py",
830   "mimetype": "text/x-python",
831   "name": "python",
832   "nbconvert_exporter": "python",
833   "pygments_lexer": "ipython3",
834   "version": "3.8.0"
835  }
836 },
837 "nbformat": 4,
838 "nbformat_minor": 5
839}