05_preprocessing.ipynb

Download
json 840 lines 23.4 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "id": "cell-0",
  6   "metadata": {},
  7   "source": [
  8    "# 05. ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ (Data Preprocessing)\n",
  9    "\n",
 10    "## ํ•™์Šต ๋ชฉํ‘œ\n",
 11    "- ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ ์ „๋žต ์ดํ•ด\n",
 12    "- ํŠน์„ฑ ์Šค์ผ€์ผ๋ง ๋ฐฉ๋ฒ• ๋น„๊ต\n",
 13    "- ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ\n",
 14    "- ๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ"
 15   ]
 16  },
 17  {
 18   "cell_type": "code",
 19   "execution_count": null,
 20   "id": "cell-1",
 21   "metadata": {},
 22   "outputs": [],
 23   "source": [
 24    "import numpy as np\n",
 25    "import pandas as pd\n",
 26    "import matplotlib.pyplot as plt\n",
 27    "import seaborn as sns\n",
 28    "from sklearn.model_selection import train_test_split\n",
 29    "from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler\n",
 30    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder\n",
 31    "from sklearn.impute import SimpleImputer, KNNImputer\n",
 32    "from sklearn.datasets import load_iris, load_wine\n",
 33    "\n",
 34    "plt.rcParams['font.family'] = 'DejaVu Sans'\n",
 35    "plt.rcParams['axes.unicode_minus'] = False"
 36   ]
 37  },
 38  {
 39   "cell_type": "markdown",
 40   "id": "cell-2",
 41   "metadata": {},
 42   "source": [
 43    "## 1. ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ (Handling Missing Values)"
 44   ]
 45  },
 46  {
 47   "cell_type": "code",
 48   "execution_count": null,
 49   "id": "cell-3",
 50   "metadata": {},
 51   "outputs": [],
 52   "source": [
 53    "# ๊ฒฐ์ธก์น˜๊ฐ€ ์žˆ๋Š” ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ\n",
 54    "np.random.seed(42)\n",
 55    "data = {\n",
 56    "    'age': [25, 30, np.nan, 40, 35, np.nan, 50, 28],\n",
 57    "    'income': [50000, np.nan, 60000, 80000, np.nan, 70000, 90000, 55000],\n",
 58    "    'score': [85, 90, 75, np.nan, 88, 92, np.nan, 78]\n",
 59    "}\n",
 60    "df = pd.DataFrame(data)\n",
 61    "\n",
 62    "print(\"์›๋ณธ ๋ฐ์ดํ„ฐ:\")\n",
 63    "print(df)\n",
 64    "print(f\"\\n๊ฒฐ์ธก์น˜ ๊ฐœ์ˆ˜:\\n{df.isnull().sum()}\")\n",
 65    "print(f\"\\n๊ฒฐ์ธก์น˜ ๋น„์œจ(%):\\n{(df.isnull().mean() * 100).round(2)}\")"
 66   ]
 67  },
 68  {
 69   "cell_type": "markdown",
 70   "id": "cell-4",
 71   "metadata": {},
 72   "source": [
 73    "### 1.1 SimpleImputer - ๊ธฐ๋ณธ ๋Œ€์ฒด ์ „๋žต"
 74   ]
 75  },
 76  {
 77   "cell_type": "code",
 78   "execution_count": null,
 79   "id": "cell-5",
 80   "metadata": {},
 81   "outputs": [],
 82   "source": [
 83    "# ํ‰๊ท ๊ฐ’์œผ๋กœ ๋Œ€์ฒด\n",
 84    "imputer_mean = SimpleImputer(strategy='mean')\n",
 85    "df_mean = pd.DataFrame(\n",
 86    "    imputer_mean.fit_transform(df),\n",
 87    "    columns=df.columns\n",
 88    ")\n",
 89    "\n",
 90    "# ์ค‘์•™๊ฐ’์œผ๋กœ ๋Œ€์ฒด\n",
 91    "imputer_median = SimpleImputer(strategy='median')\n",
 92    "df_median = pd.DataFrame(\n",
 93    "    imputer_median.fit_transform(df),\n",
 94    "    columns=df.columns\n",
 95    ")\n",
 96    "\n",
 97    "# ์ตœ๋นˆ๊ฐ’์œผ๋กœ ๋Œ€์ฒด\n",
 98    "imputer_frequent = SimpleImputer(strategy='most_frequent')\n",
 99    "df_frequent = pd.DataFrame(\n",
100    "    imputer_frequent.fit_transform(df),\n",
101    "    columns=df.columns\n",
102    ")\n",
103    "\n",
104    "# ์ƒ์ˆ˜๊ฐ’์œผ๋กœ ๋Œ€์ฒด\n",
105    "imputer_constant = SimpleImputer(strategy='constant', fill_value=0)\n",
106    "df_constant = pd.DataFrame(\n",
107    "    imputer_constant.fit_transform(df),\n",
108    "    columns=df.columns\n",
109    ")\n",
110    "\n",
111    "print(\"ํ‰๊ท ๊ฐ’ ๋Œ€์ฒด:\")\n",
112    "print(df_mean)\n",
113    "print(f\"\\n์ค‘์•™๊ฐ’ ๋Œ€์ฒด (age ์ปฌ๋Ÿผ): {df_median['age'].values}\")\n",
114    "print(f\"์ตœ๋นˆ๊ฐ’ ๋Œ€์ฒด (age ์ปฌ๋Ÿผ): {df_frequent['age'].values}\")"
115   ]
116  },
117  {
118   "cell_type": "markdown",
119   "id": "cell-6",
120   "metadata": {},
121   "source": [
122    "### 1.2 KNNImputer - K-์ตœ๊ทผ์ ‘ ์ด์›ƒ ๋Œ€์ฒด"
123   ]
124  },
125  {
126   "cell_type": "code",
127   "execution_count": null,
128   "id": "cell-7",
129   "metadata": {},
130   "outputs": [],
131   "source": [
132    "# KNN ๊ธฐ๋ฐ˜ ๊ฒฐ์ธก์น˜ ๋Œ€์ฒด\n",
133    "imputer_knn = KNNImputer(n_neighbors=3)\n",
134    "df_knn = pd.DataFrame(\n",
135    "    imputer_knn.fit_transform(df),\n",
136    "    columns=df.columns\n",
137    ")\n",
138    "\n",
139    "print(\"KNN ๋Œ€์ฒด:\")\n",
140    "print(df_knn)\n",
141    "\n",
142    "# ์‹œ๊ฐํ™” ๋น„๊ต\n",
143    "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
144    "\n",
145    "for ax, (method, df_filled) in zip(axes, [\n",
146    "    ('Mean', df_mean), \n",
147    "    ('Median', df_median), \n",
148    "    ('KNN', df_knn)\n",
149    "]):\n",
150    "    ax.scatter(df_filled['age'], df_filled['income'], alpha=0.7, s=100)\n",
151    "    ax.set_xlabel('Age')\n",
152    "    ax.set_ylabel('Income')\n",
153    "    ax.set_title(f'{method} Imputation')\n",
154    "    ax.grid(True, alpha=0.3)\n",
155    "\n",
156    "plt.tight_layout()\n",
157    "plt.show()"
158   ]
159  },
160  {
161   "cell_type": "markdown",
162   "id": "cell-8",
163   "metadata": {},
164   "source": [
165    "## 2. ํŠน์„ฑ ์Šค์ผ€์ผ๋ง (Feature Scaling)"
166   ]
167  },
168  {
169   "cell_type": "code",
170   "execution_count": null,
171   "id": "cell-9",
172   "metadata": {},
173   "outputs": [],
174   "source": [
175    "# ์Šค์ผ€์ผ์ด ๋‹ค๋ฅธ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ\n",
176    "np.random.seed(42)\n",
177    "data_scale = {\n",
178    "    'age': np.random.randint(20, 60, 100),\n",
179    "    'income': np.random.randint(30000, 150000, 100),\n",
180    "    'score': np.random.uniform(0, 100, 100)\n",
181    "}\n",
182    "df_scale = pd.DataFrame(data_scale)\n",
183    "\n",
184    "print(\"์›๋ณธ ๋ฐ์ดํ„ฐ ํ†ต๊ณ„:\")\n",
185    "print(df_scale.describe())"
186   ]
187  },
188  {
189   "cell_type": "markdown",
190   "id": "cell-10",
191   "metadata": {},
192   "source": [
193    "### 2.1 StandardScaler (ํ‘œ์ค€ํ™”)"
194   ]
195  },
196  {
197   "cell_type": "code",
198   "execution_count": null,
199   "id": "cell-11",
200   "metadata": {},
201   "outputs": [],
202   "source": [
203    "# StandardScaler: (x - mean) / std\n",
204    "scaler_standard = StandardScaler()\n",
205    "df_standard = pd.DataFrame(\n",
206    "    scaler_standard.fit_transform(df_scale),\n",
207    "    columns=df_scale.columns\n",
208    ")\n",
209    "\n",
210    "print(\"StandardScaler ๊ฒฐ๊ณผ:\")\n",
211    "print(df_standard.describe())\n",
212    "print(f\"\\nํ‰๊ท : {df_standard.mean().values}\")\n",
213    "print(f\"ํ‘œ์ค€ํŽธ์ฐจ: {df_standard.std().values}\")"
214   ]
215  },
216  {
217   "cell_type": "markdown",
218   "id": "cell-12",
219   "metadata": {},
220   "source": [
221    "### 2.2 MinMaxScaler (์ •๊ทœํ™”)"
222   ]
223  },
224  {
225   "cell_type": "code",
226   "execution_count": null,
227   "id": "cell-13",
228   "metadata": {},
229   "outputs": [],
230   "source": [
231    "# MinMaxScaler: (x - min) / (max - min)\n",
232    "scaler_minmax = MinMaxScaler(feature_range=(0, 1))\n",
233    "df_minmax = pd.DataFrame(\n",
234    "    scaler_minmax.fit_transform(df_scale),\n",
235    "    columns=df_scale.columns\n",
236    ")\n",
237    "\n",
238    "print(\"MinMaxScaler ๊ฒฐ๊ณผ:\")\n",
239    "print(df_minmax.describe())\n",
240    "print(f\"\\n์ตœ์†Ÿ๊ฐ’: {df_minmax.min().values}\")\n",
241    "print(f\"์ตœ๋Œ“๊ฐ’: {df_minmax.max().values}\")"
242   ]
243  },
244  {
245   "cell_type": "markdown",
246   "id": "cell-14",
247   "metadata": {},
248   "source": [
249    "### 2.3 RobustScaler (์ด์ƒ์น˜์— ๊ฐ•๊ฑด)"
250   ]
251  },
252  {
253   "cell_type": "code",
254   "execution_count": null,
255   "id": "cell-15",
256   "metadata": {},
257   "outputs": [],
258   "source": [
259    "# RobustScaler: (x - median) / IQR\n",
260    "scaler_robust = RobustScaler()\n",
261    "df_robust = pd.DataFrame(\n",
262    "    scaler_robust.fit_transform(df_scale),\n",
263    "    columns=df_scale.columns\n",
264    ")\n",
265    "\n",
266    "print(\"RobustScaler ๊ฒฐ๊ณผ:\")\n",
267    "print(df_robust.describe())"
268   ]
269  },
270  {
271   "cell_type": "markdown",
272   "id": "cell-16",
273   "metadata": {},
274   "source": [
275    "### 2.4 ์Šค์ผ€์ผ๋Ÿฌ ๋น„๊ต ์‹œ๊ฐํ™”"
276   ]
277  },
278  {
279   "cell_type": "code",
280   "execution_count": null,
281   "id": "cell-17",
282   "metadata": {},
283   "outputs": [],
284   "source": [
285    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
286    "axes = axes.flatten()\n",
287    "\n",
288    "# ์ด์ƒ์น˜ ์ถ”๊ฐ€\n",
289    "df_outlier = df_scale.copy()\n",
290    "df_outlier.loc[0, 'income'] = 500000  # ์ด์ƒ์น˜ ์ถ”๊ฐ€\n",
291    "\n",
292    "scalers = [\n",
293    "    ('Original', df_outlier),\n",
294    "    ('StandardScaler', pd.DataFrame(StandardScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
295    "    ('MinMaxScaler', pd.DataFrame(MinMaxScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
296    "    ('RobustScaler', pd.DataFrame(RobustScaler().fit_transform(df_outlier), columns=df_outlier.columns))\n",
297    "]\n",
298    "\n",
299    "for ax, (name, data) in zip(axes, scalers):\n",
300    "    ax.boxplot([data['age'], data['income'], data['score']], labels=['age', 'income', 'score'])\n",
301    "    ax.set_title(name)\n",
302    "    ax.set_ylabel('Value')\n",
303    "    ax.grid(True, alpha=0.3)\n",
304    "\n",
305    "plt.tight_layout()\n",
306    "plt.show()"
307   ]
308  },
309  {
310   "cell_type": "markdown",
311   "id": "cell-18",
312   "metadata": {},
313   "source": [
314    "## 3. ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ (Categorical Encoding)"
315   ]
316  },
317  {
318   "cell_type": "code",
319   "execution_count": null,
320   "id": "cell-19",
321   "metadata": {},
322   "outputs": [],
323   "source": [
324    "# ๋ฒ”์ฃผํ˜• ๋ฐ์ดํ„ฐ ์ƒ˜ํ”Œ\n",
325    "data_cat = {\n",
326    "    'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red'],\n",
327    "    'size': ['S', 'M', 'L', 'M', 'S', 'L', 'M'],\n",
328    "    'quality': ['good', 'excellent', 'poor', 'good', 'excellent', 'poor', 'good']\n",
329    "}\n",
330    "df_cat = pd.DataFrame(data_cat)\n",
331    "\n",
332    "print(\"๋ฒ”์ฃผํ˜• ๋ฐ์ดํ„ฐ:\")\n",
333    "print(df_cat)"
334   ]
335  },
336  {
337   "cell_type": "markdown",
338   "id": "cell-20",
339   "metadata": {},
340   "source": [
341    "### 3.1 LabelEncoder (๋ ˆ์ด๋ธ” ์ธ์ฝ”๋”ฉ)"
342   ]
343  },
344  {
345   "cell_type": "code",
346   "execution_count": null,
347   "id": "cell-21",
348   "metadata": {},
349   "outputs": [],
350   "source": [
351    "# LabelEncoder: ๋ฒ”์ฃผ๋ฅผ ์ •์ˆ˜๋กœ ๋ณ€ํ™˜\n",
352    "le_color = LabelEncoder()\n",
353    "df_cat['color_encoded'] = le_color.fit_transform(df_cat['color'])\n",
354    "\n",
355    "print(\"LabelEncoder ๊ฒฐ๊ณผ:\")\n",
356    "print(df_cat[['color', 'color_encoded']])\n",
357    "print(f\"\\nํด๋ž˜์Šค: {le_color.classes_}\")\n",
358    "print(f\"๋ณ€ํ™˜: {dict(zip(le_color.classes_, le_color.transform(le_color.classes_)))}\")"
359   ]
360  },
361  {
362   "cell_type": "markdown",
363   "id": "cell-22",
364   "metadata": {},
365   "source": [
366    "### 3.2 OneHotEncoder (์›-ํ•ซ ์ธ์ฝ”๋”ฉ)"
367   ]
368  },
369  {
370   "cell_type": "code",
371   "execution_count": null,
372   "id": "cell-23",
373   "metadata": {},
374   "outputs": [],
375   "source": [
376    "# OneHotEncoder: ๋ฒ”์ฃผ๋ฅผ ์ด์ง„ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜\n",
377    "ohe = OneHotEncoder(sparse_output=False)\n",
378    "color_onehot = ohe.fit_transform(df_cat[['color']])\n",
379    "\n",
380    "# DataFrame์œผ๋กœ ๋ณ€ํ™˜\n",
381    "df_onehot = pd.DataFrame(\n",
382    "    color_onehot,\n",
383    "    columns=ohe.get_feature_names_out(['color'])\n",
384    ")\n",
385    "\n",
386    "print(\"OneHotEncoder ๊ฒฐ๊ณผ:\")\n",
387    "print(pd.concat([df_cat['color'], df_onehot], axis=1))"
388   ]
389  },
390  {
391   "cell_type": "markdown",
392   "id": "cell-24",
393   "metadata": {},
394   "source": [
395    "### 3.3 OrdinalEncoder (์ˆœ์„œํ˜• ์ธ์ฝ”๋”ฉ)"
396   ]
397  },
398  {
399   "cell_type": "code",
400   "execution_count": null,
401   "id": "cell-25",
402   "metadata": {},
403   "outputs": [],
404   "source": [
405    "# OrdinalEncoder: ์ˆœ์„œ๊ฐ€ ์žˆ๋Š” ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜\n",
406    "oe = OrdinalEncoder(categories=[['poor', 'good', 'excellent']])\n",
407    "df_cat['quality_encoded'] = oe.fit_transform(df_cat[['quality']])\n",
408    "\n",
409    "print(\"OrdinalEncoder ๊ฒฐ๊ณผ:\")\n",
410    "print(df_cat[['quality', 'quality_encoded']])\n",
411    "print(f\"\\n์ˆœ์„œ: poor(0) < good(1) < excellent(2)\")"
412   ]
413  },
414  {
415   "cell_type": "markdown",
416   "id": "cell-26",
417   "metadata": {},
418   "source": [
419    "### 3.4 Pandas get_dummies"
420   ]
421  },
422  {
423   "cell_type": "code",
424   "execution_count": null,
425   "id": "cell-27",
426   "metadata": {},
427   "outputs": [],
428   "source": [
429    "# pandas์˜ get_dummies (๊ฐ„ํŽธํ•œ ์›-ํ•ซ ์ธ์ฝ”๋”ฉ)\n",
430    "df_dummies = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'])\n",
431    "\n",
432    "print(\"pd.get_dummies ๊ฒฐ๊ณผ:\")\n",
433    "print(df_dummies.head())\n",
434    "\n",
435    "# drop_first=True๋กœ ๋‹ค์ค‘๊ณต์„ ์„ฑ ๋ฐฉ์ง€\n",
436    "df_dummies_drop = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'], drop_first=True)\n",
437    "print(f\"\\ndrop_first=True (shape: {df_dummies_drop.shape}):\")\n",
438    "print(df_dummies_drop.head())"
439   ]
440  },
441  {
442   "cell_type": "markdown",
443   "id": "cell-28",
444   "metadata": {},
445   "source": [
446    "## 4. ํŠน์„ฑ ์„ ํƒ (Feature Selection)"
447   ]
448  },
449  {
450   "cell_type": "code",
451   "execution_count": null,
452   "id": "cell-29",
453   "metadata": {},
454   "outputs": [],
455   "source": [
456    "from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif\n",
457    "from sklearn.feature_selection import RFE\n",
458    "from sklearn.ensemble import RandomForestClassifier\n",
459    "\n",
460    "# Iris ๋ฐ์ดํ„ฐ ๋กœ๋“œ\n",
461    "iris = load_iris()\n",
462    "X, y = iris.data, iris.target\n",
463    "\n",
464    "print(f\"์›๋ณธ ๋ฐ์ดํ„ฐ: {X.shape}\")\n",
465    "print(f\"ํŠน์„ฑ ์ด๋ฆ„: {iris.feature_names}\")"
466   ]
467  },
468  {
469   "cell_type": "markdown",
470   "id": "cell-30",
471   "metadata": {},
472   "source": [
473    "### 4.1 SelectKBest (ํ†ต๊ณ„์  ์„ ํƒ)"
474   ]
475  },
476  {
477   "cell_type": "code",
478   "execution_count": null,
479   "id": "cell-31",
480   "metadata": {},
481   "outputs": [],
482   "source": [
483    "# F-ํ†ต๊ณ„๋Ÿ‰ ๊ธฐ๋ฐ˜ ์„ ํƒ\n",
484    "selector_f = SelectKBest(score_func=f_classif, k=2)\n",
485    "X_kbest_f = selector_f.fit_transform(X, y)\n",
486    "\n",
487    "# ์ƒํ˜ธ์ •๋ณด๋Ÿ‰ ๊ธฐ๋ฐ˜ ์„ ํƒ\n",
488    "selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)\n",
489    "X_kbest_mi = selector_mi.fit_transform(X, y)\n",
490    "\n",
491    "print(\"SelectKBest (F-statistic):\")\n",
492    "scores_f = pd.DataFrame({\n",
493    "    'Feature': iris.feature_names,\n",
494    "    'Score': selector_f.scores_\n",
495    "}).sort_values('Score', ascending=False)\n",
496    "print(scores_f)\n",
497    "\n",
498    "print(\"\\nSelectKBest (Mutual Information):\")\n",
499    "scores_mi = pd.DataFrame({\n",
500    "    'Feature': iris.feature_names,\n",
501    "    'Score': selector_mi.scores_\n",
502    "}).sort_values('Score', ascending=False)\n",
503    "print(scores_mi)"
504   ]
505  },
506  {
507   "cell_type": "markdown",
508   "id": "cell-32",
509   "metadata": {},
510   "source": [
511    "### 4.2 RFE (์žฌ๊ท€์  ํŠน์„ฑ ์ œ๊ฑฐ)"
512   ]
513  },
514  {
515   "cell_type": "code",
516   "execution_count": null,
517   "id": "cell-33",
518   "metadata": {},
519   "outputs": [],
520   "source": [
521    "# RFE with Random Forest\n",
522    "estimator = RandomForestClassifier(n_estimators=50, random_state=42)\n",
523    "selector_rfe = RFE(estimator, n_features_to_select=2, step=1)\n",
524    "X_rfe = selector_rfe.fit_transform(X, y)\n",
525    "\n",
526    "print(\"RFE ๊ฒฐ๊ณผ:\")\n",
527    "rfe_result = pd.DataFrame({\n",
528    "    'Feature': iris.feature_names,\n",
529    "    'Selected': selector_rfe.support_,\n",
530    "    'Ranking': selector_rfe.ranking_\n",
531    "}).sort_values('Ranking')\n",
532    "print(rfe_result)"
533   ]
534  },
535  {
536   "cell_type": "markdown",
537   "id": "cell-34",
538   "metadata": {},
539   "source": [
540    "### 4.3 ํŠน์„ฑ ์ค‘์š”๋„ (๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ)"
541   ]
542  },
543  {
544   "cell_type": "code",
545   "execution_count": null,
546   "id": "cell-35",
547   "metadata": {},
548   "outputs": [],
549   "source": [
550    "# ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ ํŠน์„ฑ ์ค‘์š”๋„\n",
551    "rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
552    "rf.fit(X, y)\n",
553    "\n",
554    "importance = pd.DataFrame({\n",
555    "    'Feature': iris.feature_names,\n",
556    "    'Importance': rf.feature_importances_\n",
557    "}).sort_values('Importance', ascending=True)\n",
558    "\n",
559    "plt.figure(figsize=(10, 6))\n",
560    "plt.barh(importance['Feature'], importance['Importance'])\n",
561    "plt.xlabel('Importance')\n",
562    "plt.title('Random Forest Feature Importance - Iris Dataset')\n",
563    "plt.grid(True, alpha=0.3)\n",
564    "plt.tight_layout()\n",
565    "plt.show()"
566   ]
567  },
568  {
569   "cell_type": "markdown",
570   "id": "cell-36",
571   "metadata": {},
572   "source": [
573    "## 5. ๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ (Imbalanced Data)"
574   ]
575  },
576  {
577   "cell_type": "code",
578   "execution_count": null,
579   "id": "cell-37",
580   "metadata": {},
581   "outputs": [],
582   "source": [
583    "from sklearn.datasets import make_classification\n",
584    "\n",
585    "# ๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ ์ƒ์„ฑ (10:1 ๋น„์œจ)\n",
586    "X_imb, y_imb = make_classification(\n",
587    "    n_samples=1000,\n",
588    "    n_features=20,\n",
589    "    n_informative=15,\n",
590    "    n_redundant=5,\n",
591    "    n_classes=2,\n",
592    "    weights=[0.9, 0.1],  # 90% vs 10%\n",
593    "    random_state=42\n",
594    ")\n",
595    "\n",
596    "# ํด๋ž˜์Šค ๋ถ„ํฌ ํ™•์ธ\n",
597    "unique, counts = np.unique(y_imb, return_counts=True)\n",
598    "print(\"ํด๋ž˜์Šค ๋ถ„ํฌ:\")\n",
599    "for cls, cnt in zip(unique, counts):\n",
600    "    print(f\"  Class {cls}: {cnt} ({cnt/len(y_imb)*100:.1f}%)\")\n",
601    "\n",
602    "# ์‹œ๊ฐํ™”\n",
603    "plt.figure(figsize=(8, 5))\n",
604    "plt.bar(['Class 0', 'Class 1'], counts, color=['skyblue', 'salmon'])\n",
605    "plt.ylabel('Count')\n",
606    "plt.title('Imbalanced Dataset Distribution')\n",
607    "plt.grid(True, alpha=0.3)\n",
608    "plt.show()"
609   ]
610  },
611  {
612   "cell_type": "markdown",
613   "id": "cell-38",
614   "metadata": {},
615   "source": [
616    "### 5.1 SMOTE ๊ฐœ๋… (์ด๋ก )"
617   ]
618  },
619  {
620   "cell_type": "code",
621   "execution_count": null,
622   "id": "cell-39",
623   "metadata": {},
624   "outputs": [],
625   "source": [
626    "# SMOTE (Synthetic Minority Over-sampling Technique) ๊ฐœ๋… ์„ค๋ช…\n",
627    "print(\"\"\"\n",
628    "SMOTE ์ž‘๋™ ์›๋ฆฌ:\n",
629    "\n",
630    "1. ์†Œ์ˆ˜ ํด๋ž˜์Šค์˜ ๊ฐ ์ƒ˜ํ”Œ์— ๋Œ€ํ•ด:\n",
631    "   - K๊ฐœ์˜ ์ตœ๊ทผ์ ‘ ์ด์›ƒ์„ ์ฐพ์Œ (๋ณดํ†ต k=5)\n",
632    "   \n",
633    "2. ๋žœ๋คํ•˜๊ฒŒ ์„ ํƒ๋œ ์ด์›ƒ๊ณผ์˜ ์„ ํ˜• ๋ณด๊ฐ„:\n",
634    "   - new_sample = sample + ฮป ร— (neighbor - sample)\n",
635    "   - ฮป๋Š” 0๊ณผ 1 ์‚ฌ์ด์˜ ๋žœ๋ค๊ฐ’\n",
636    "   \n",
637    "3. ํ•ฉ์„ฑ ์ƒ˜ํ”Œ์„ ์ƒ์„ฑํ•˜์—ฌ ์†Œ์ˆ˜ ํด๋ž˜์Šค ์ฆ๊ฐ•\n",
638    "\n",
639    "์žฅ์ :\n",
640    "- ๊ณผ์ ํ•ฉ ์œ„ํ—˜์ด ๋‚ฎ์Œ (๋‹จ์ˆœ ๋ณต์ œ๊ฐ€ ์•„๋‹˜)\n",
641    "- ๊ฒฐ์ • ๊ฒฝ๊ณ„๊ฐ€ ๋” ์ผ๋ฐ˜ํ™”๋จ\n",
642    "\n",
643    "๋‹จ์ :\n",
644    "- ๋…ธ์ด์ฆˆ์— ๋ฏผ๊ฐํ•  ์ˆ˜ ์žˆ์Œ\n",
645    "- ๊ณ ์ฐจ์› ๋ฐ์ดํ„ฐ์—์„œ๋Š” ํšจ๊ณผ๊ฐ€ ์ œํ•œ์ \n",
646    "\n",
647    "์‚ฌ์šฉ ๋ฐฉ๋ฒ•:\n",
648    "- pip install imbalanced-learn\n",
649    "- from imblearn.over_sampling import SMOTE\n",
650    "- smote = SMOTE(random_state=42)\n",
651    "- X_resampled, y_resampled = smote.fit_resample(X, y)\n",
652    "\"\"\")"
653   ]
654  },
655  {
656   "cell_type": "markdown",
657   "id": "cell-40",
658   "metadata": {},
659   "source": [
660    "### 5.2 ํด๋ž˜์Šค ๊ฐ€์ค‘์น˜ ์กฐ์ •"
661   ]
662  },
663  {
664   "cell_type": "code",
665   "execution_count": null,
666   "id": "cell-41",
667   "metadata": {},
668   "outputs": [],
669   "source": [
670    "from sklearn.linear_model import LogisticRegression\n",
671    "from sklearn.metrics import classification_report, confusion_matrix\n",
672    "\n",
673    "X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(\n",
674    "    X_imb, y_imb, test_size=0.3, random_state=42, stratify=y_imb\n",
675    ")\n",
676    "\n",
677    "# ๊ฐ€์ค‘์น˜ ์—†์Œ\n",
678    "clf_no_weight = LogisticRegression(random_state=42, max_iter=1000)\n",
679    "clf_no_weight.fit(X_train_imb, y_train_imb)\n",
680    "y_pred_no_weight = clf_no_weight.predict(X_test_imb)\n",
681    "\n",
682    "# ๊ฐ€์ค‘์น˜ ์กฐ์ • (balanced)\n",
683    "clf_balanced = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)\n",
684    "clf_balanced.fit(X_train_imb, y_train_imb)\n",
685    "y_pred_balanced = clf_balanced.predict(X_test_imb)\n",
686    "\n",
687    "print(\"=== ๊ฐ€์ค‘์น˜ ์—†์Œ ===\")\n",
688    "print(classification_report(y_test_imb, y_pred_no_weight))\n",
689    "\n",
690    "print(\"\\n=== ๊ฐ€์ค‘์น˜ ์กฐ์ • (balanced) ===\")\n",
691    "print(classification_report(y_test_imb, y_pred_balanced))"
692   ]
693  },
694  {
695   "cell_type": "markdown",
696   "id": "cell-42",
697   "metadata": {},
698   "source": [
699    "## 6. ์‹ค์ „ ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ"
700   ]
701  },
702  {
703   "cell_type": "code",
704   "execution_count": null,
705   "id": "cell-43",
706   "metadata": {},
707   "outputs": [],
708   "source": [
709    "from sklearn.pipeline import Pipeline\n",
710    "from sklearn.compose import ColumnTransformer\n",
711    "\n",
712    "# ํ˜ผํ•ฉ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ\n",
713    "data_mixed = {\n",
714    "    'age': [25, np.nan, 35, 40, 30, 45, np.nan, 28],\n",
715    "    'income': [50000, 60000, np.nan, 80000, 70000, 90000, 55000, np.nan],\n",
716    "    'city': ['Seoul', 'Busan', 'Seoul', 'Daegu', 'Busan', 'Seoul', 'Daegu', 'Busan'],\n",
717    "    'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],\n",
718    "    'purchased': [0, 1, 1, 0, 1, 1, 0, 1]\n",
719    "}\n",
720    "df_mixed = pd.DataFrame(data_mixed)\n",
721    "\n",
722    "X_mixed = df_mixed.drop('purchased', axis=1)\n",
723    "y_mixed = df_mixed['purchased']\n",
724    "\n",
725    "print(\"ํ˜ผํ•ฉ ๋ฐ์ดํ„ฐ:\")\n",
726    "print(df_mixed)"
727   ]
728  },
729  {
730   "cell_type": "code",
731   "execution_count": null,
732   "id": "cell-44",
733   "metadata": {},
734   "outputs": [],
735   "source": [
736    "# ์ˆ˜์น˜ํ˜•/๋ฒ”์ฃผํ˜• ํŠน์„ฑ ๋ถ„๋ฆฌ\n",
737    "numeric_features = ['age', 'income']\n",
738    "categorical_features = ['city', 'education']\n",
739    "\n",
740    "# ์ˆ˜์น˜ํ˜• ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ\n",
741    "numeric_transformer = Pipeline(steps=[\n",
742    "    ('imputer', SimpleImputer(strategy='median')),\n",
743    "    ('scaler', StandardScaler())\n",
744    "])\n",
745    "\n",
746    "# ๋ฒ”์ฃผํ˜• ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ\n",
747    "categorical_transformer = Pipeline(steps=[\n",
748    "    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
749    "    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))\n",
750    "])\n",
751    "\n",
752    "# ColumnTransformer๋กœ ๊ฒฐํ•ฉ\n",
753    "preprocessor = ColumnTransformer(\n",
754    "    transformers=[\n",
755    "        ('num', numeric_transformer, numeric_features),\n",
756    "        ('cat', categorical_transformer, categorical_features)\n",
757    "    ]\n",
758    ")\n",
759    "\n",
760    "# ์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ\n",
761    "pipeline = Pipeline(steps=[\n",
762    "    ('preprocessor', preprocessor),\n",
763    "    ('classifier', LogisticRegression(random_state=42))\n",
764    "])\n",
765    "\n",
766    "# ํ•™์Šต (์ž‘์€ ๋ฐ์ดํ„ฐ์ด๋ฏ€๋กœ ์ „์ฒด ์‚ฌ์šฉ)\n",
767    "pipeline.fit(X_mixed, y_mixed)\n",
768    "\n",
769    "# ์ƒˆ๋กœ์šด ๋ฐ์ดํ„ฐ ์˜ˆ์ธก\n",
770    "new_data = pd.DataFrame({\n",
771    "    'age': [30],\n",
772    "    'income': [70000],\n",
773    "    'city': ['Seoul'],\n",
774    "    'education': ['Master']\n",
775    "})\n",
776    "\n",
777    "prediction = pipeline.predict(new_data)\n",
778    "probability = pipeline.predict_proba(new_data)\n",
779    "\n",
780    "print(f\"\\n์˜ˆ์ธก ๊ฒฐ๊ณผ: {prediction[0]}\")\n",
781    "print(f\"ํ™•๋ฅ : {probability[0]}\")"
782   ]
783  },
784  {
785   "cell_type": "markdown",
786   "id": "cell-45",
787   "metadata": {},
788   "source": [
789    "## ์ •๋ฆฌ\n",
790    "\n",
791    "### ํ•ต์‹ฌ ๊ฐœ๋…\n",
792    "\n",
793    "**๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ:**\n",
794    "- **SimpleImputer**: ํ‰๊ท , ์ค‘์•™๊ฐ’, ์ตœ๋นˆ๊ฐ’, ์ƒ์ˆ˜๋กœ ๋Œ€์ฒด\n",
795    "- **KNNImputer**: K-์ตœ๊ทผ์ ‘ ์ด์›ƒ ๊ธฐ๋ฐ˜ ๋Œ€์ฒด\n",
796    "\n",
797    "**ํŠน์„ฑ ์Šค์ผ€์ผ๋ง:**\n",
798    "- **StandardScaler**: ํ‰๊ท  0, ํ‘œ์ค€ํŽธ์ฐจ 1 (์ •๊ทœ๋ถ„ํฌ ๊ฐ€์ •)\n",
799    "- **MinMaxScaler**: 0-1 ๋ฒ”์œ„๋กœ ์ •๊ทœํ™”\n",
800    "- **RobustScaler**: ์ค‘์•™๊ฐ’๊ณผ IQR ์‚ฌ์šฉ (์ด์ƒ์น˜์— ๊ฐ•๊ฑด)\n",
801    "\n",
802    "**๋ฒ”์ฃผํ˜• ์ธ์ฝ”๋”ฉ:**\n",
803    "- **LabelEncoder**: ์ˆœ์„œ ์—†๋Š” ๋ถ„๋ฅ˜ (ํƒ€๊ฒŸ ๋ณ€์ˆ˜์šฉ)\n",
804    "- **OneHotEncoder**: ์ด์ง„ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ (๋‹ค์ค‘๊ณต์„ ์„ฑ ์ฃผ์˜)\n",
805    "- **OrdinalEncoder**: ์ˆœ์„œ๊ฐ€ ์žˆ๋Š” ๋ฒ”์ฃผํ˜•\n",
806    "\n",
807    "**๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ:**\n",
808    "- **SMOTE**: ํ•ฉ์„ฑ ์ƒ˜ํ”Œ ์ƒ์„ฑ (์š”๊ตฌ์‚ฌํ•ญ: imbalanced-learn)\n",
809    "- **class_weight**: ๋ชจ๋ธ ๊ฐ€์ค‘์น˜ ์กฐ์ •\n",
810    "\n",
811    "### ๋‹ค์Œ ๋‹จ๊ณ„\n",
812    "- Pipeline๊ณผ ColumnTransformer ํ™œ์šฉ\n",
813    "- ๊ต์ฐจ ๊ฒ€์ฆ๊ณผ ์ „์ฒ˜๋ฆฌ ํ†ตํ•ฉ\n",
814    "- ์‹ค์ „ ํ”„๋กœ์ ํŠธ ์ ์šฉ"
815   ]
816  }
817 ],
818 "metadata": {
819  "kernelspec": {
820   "display_name": "Python 3",
821   "language": "python",
822   "name": "python3"
823  },
824  "language_info": {
825   "codemirror_mode": {
826    "name": "ipython",
827    "version": 3
828   },
829   "file_extension": ".py",
830   "mimetype": "text/x-python",
831   "name": "python",
832   "nbconvert_exporter": "python",
833   "pygments_lexer": "ipython3",
834   "version": "3.8.0"
835  }
836 },
837 "nbformat": 4,
838 "nbformat_minor": 5
839}