1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "id": "cell-0",
6 "metadata": {},
7 "source": [
8 "# 05. 데이터 전처리 (Data Preprocessing)\n",
9 "\n",
10 "## 학습 목표\n",
11 "- 결측치 처리 전략 이해\n",
12 "- 특성 스케일링 방법 비교\n",
13 "- 범주형 변수 인코딩\n",
14 "- 불균형 데이터 처리"
15 ]
16 },
17 {
18 "cell_type": "code",
19 "execution_count": null,
20 "id": "cell-1",
21 "metadata": {},
22 "outputs": [],
23 "source": [
24 "import numpy as np\n",
25 "import pandas as pd\n",
26 "import matplotlib.pyplot as plt\n",
27 "import seaborn as sns\n",
28 "from sklearn.model_selection import train_test_split\n",
29 "from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler\n",
30 "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder\n",
31 "from sklearn.impute import SimpleImputer, KNNImputer\n",
32 "from sklearn.datasets import load_iris, load_wine\n",
33 "\n",
34 "plt.rcParams['font.family'] = 'DejaVu Sans'\n",
35 "plt.rcParams['axes.unicode_minus'] = False"
36 ]
37 },
38 {
39 "cell_type": "markdown",
40 "id": "cell-2",
41 "metadata": {},
42 "source": [
43 "## 1. 결측치 처리 (Handling Missing Values)"
44 ]
45 },
46 {
47 "cell_type": "code",
48 "execution_count": null,
49 "id": "cell-3",
50 "metadata": {},
51 "outputs": [],
52 "source": [
53 "# ๊ฒฐ์ธก์น๊ฐ ์๋ ์ํ ๋ฐ์ดํฐ ์์ฑ\n",
54 "np.random.seed(42)\n",
55 "data = {\n",
56 " 'age': [25, 30, np.nan, 40, 35, np.nan, 50, 28],\n",
57 " 'income': [50000, np.nan, 60000, 80000, np.nan, 70000, 90000, 55000],\n",
58 " 'score': [85, 90, 75, np.nan, 88, 92, np.nan, 78]\n",
59 "}\n",
60 "df = pd.DataFrame(data)\n",
61 "\n",
62 "print(\"원본 데이터:\")\n",
63 "print(df)\n",
64 "print(f\"\\n결측치 개수:\\n{df.isnull().sum()}\")\n",
65 "print(f\"\\n결측치 비율(%):\\n{(df.isnull().mean() * 100).round(2)}\")"
66 ]
67 },
68 {
69 "cell_type": "markdown",
70 "id": "cell-4",
71 "metadata": {},
72 "source": [
73 "### 1.1 SimpleImputer - 기본 대체 전략"
74 ]
75 },
76 {
77 "cell_type": "code",
78 "execution_count": null,
79 "id": "cell-5",
80 "metadata": {},
81 "outputs": [],
82 "source": [
83 "# ํ๊ท ๊ฐ์ผ๋ก ๋์ฒด\n",
84 "imputer_mean = SimpleImputer(strategy='mean')\n",
85 "df_mean = pd.DataFrame(\n",
86 " imputer_mean.fit_transform(df),\n",
87 " columns=df.columns\n",
88 ")\n",
89 "\n",
90 "# ์ค์๊ฐ์ผ๋ก ๋์ฒด\n",
91 "imputer_median = SimpleImputer(strategy='median')\n",
92 "df_median = pd.DataFrame(\n",
93 " imputer_median.fit_transform(df),\n",
94 " columns=df.columns\n",
95 ")\n",
96 "\n",
97 "# ์ต๋น๊ฐ์ผ๋ก ๋์ฒด\n",
98 "imputer_frequent = SimpleImputer(strategy='most_frequent')\n",
99 "df_frequent = pd.DataFrame(\n",
100 " imputer_frequent.fit_transform(df),\n",
101 " columns=df.columns\n",
102 ")\n",
103 "\n",
104 "# ์์๊ฐ์ผ๋ก ๋์ฒด\n",
105 "imputer_constant = SimpleImputer(strategy='constant', fill_value=0)\n",
106 "df_constant = pd.DataFrame(\n",
107 " imputer_constant.fit_transform(df),\n",
108 " columns=df.columns\n",
109 ")\n",
110 "\n",
111 "print(\"ํ๊ท ๊ฐ ๋์ฒด:\")\n",
112 "print(df_mean)\n",
113 "print(f\"\\n์ค์๊ฐ ๋์ฒด (age ์ปฌ๋ผ): {df_median['age'].values}\")\n",
114 "print(f\"์ต๋น๊ฐ ๋์ฒด (age ์ปฌ๋ผ): {df_frequent['age'].values}\")"
115 ]
116 },
117 {
118 "cell_type": "markdown",
119 "id": "cell-6",
120 "metadata": {},
121 "source": [
122 "### 1.2 KNNImputer - K-최근접 이웃 대체"
123 ]
124 },
125 {
126 "cell_type": "code",
127 "execution_count": null,
128 "id": "cell-7",
129 "metadata": {},
130 "outputs": [],
131 "source": [
132 "# KNN ๊ธฐ๋ฐ ๊ฒฐ์ธก์น ๋์ฒด\n",
133 "imputer_knn = KNNImputer(n_neighbors=3)\n",
134 "df_knn = pd.DataFrame(\n",
135 " imputer_knn.fit_transform(df),\n",
136 " columns=df.columns\n",
137 ")\n",
138 "\n",
139 "print(\"KNN ๋์ฒด:\")\n",
140 "print(df_knn)\n",
141 "\n",
142 "# ์๊ฐํ ๋น๊ต\n",
143 "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
144 "\n",
145 "for ax, (method, df_filled) in zip(axes, [\n",
146 " ('Mean', df_mean), \n",
147 " ('Median', df_median), \n",
148 " ('KNN', df_knn)\n",
149 "]):\n",
150 " ax.scatter(df_filled['age'], df_filled['income'], alpha=0.7, s=100)\n",
151 " ax.set_xlabel('Age')\n",
152 " ax.set_ylabel('Income')\n",
153 " ax.set_title(f'{method} Imputation')\n",
154 " ax.grid(True, alpha=0.3)\n",
155 "\n",
156 "plt.tight_layout()\n",
157 "plt.show()"
158 ]
159 },
160 {
161 "cell_type": "markdown",
162 "id": "cell-8",
163 "metadata": {},
164 "source": [
165 "## 2. 특성 스케일링 (Feature Scaling)"
166 ]
167 },
168 {
169 "cell_type": "code",
170 "execution_count": null,
171 "id": "cell-9",
172 "metadata": {},
173 "outputs": [],
174 "source": [
175 "# ์ค์ผ์ผ์ด ๋ค๋ฅธ ๋ฐ์ดํฐ ์์ฑ\n",
176 "np.random.seed(42)\n",
177 "data_scale = {\n",
178 " 'age': np.random.randint(20, 60, 100),\n",
179 " 'income': np.random.randint(30000, 150000, 100),\n",
180 " 'score': np.random.uniform(0, 100, 100)\n",
181 "}\n",
182 "df_scale = pd.DataFrame(data_scale)\n",
183 "\n",
184 "print(\"์๋ณธ ๋ฐ์ดํฐ ํต๊ณ:\")\n",
185 "print(df_scale.describe())"
186 ]
187 },
188 {
189 "cell_type": "markdown",
190 "id": "cell-10",
191 "metadata": {},
192 "source": [
193 "### 2.1 StandardScaler (표준화)"
194 ]
195 },
196 {
197 "cell_type": "code",
198 "execution_count": null,
199 "id": "cell-11",
200 "metadata": {},
201 "outputs": [],
202 "source": [
203 "# StandardScaler: (x - mean) / std\n",
204 "scaler_standard = StandardScaler()\n",
205 "df_standard = pd.DataFrame(\n",
206 " scaler_standard.fit_transform(df_scale),\n",
207 " columns=df_scale.columns\n",
208 ")\n",
209 "\n",
210 "print(\"StandardScaler ๊ฒฐ๊ณผ:\")\n",
211 "print(df_standard.describe())\n",
212 "print(f\"\\nํ๊ท : {df_standard.mean().values}\")\n",
213 "print(f\"ํ์คํธ์ฐจ: {df_standard.std().values}\")"
214 ]
215 },
216 {
217 "cell_type": "markdown",
218 "id": "cell-12",
219 "metadata": {},
220 "source": [
221 "### 2.2 MinMaxScaler (정규화)"
222 ]
223 },
224 {
225 "cell_type": "code",
226 "execution_count": null,
227 "id": "cell-13",
228 "metadata": {},
229 "outputs": [],
230 "source": [
231 "# MinMaxScaler: (x - min) / (max - min)\n",
232 "scaler_minmax = MinMaxScaler(feature_range=(0, 1))\n",
233 "df_minmax = pd.DataFrame(\n",
234 " scaler_minmax.fit_transform(df_scale),\n",
235 " columns=df_scale.columns\n",
236 ")\n",
237 "\n",
238 "print(\"MinMaxScaler ๊ฒฐ๊ณผ:\")\n",
239 "print(df_minmax.describe())\n",
240 "print(f\"\\n์ต์๊ฐ: {df_minmax.min().values}\")\n",
241 "print(f\"์ต๋๊ฐ: {df_minmax.max().values}\")"
242 ]
243 },
244 {
245 "cell_type": "markdown",
246 "id": "cell-14",
247 "metadata": {},
248 "source": [
249 "### 2.3 RobustScaler (이상치에 강건)"
250 ]
251 },
252 {
253 "cell_type": "code",
254 "execution_count": null,
255 "id": "cell-15",
256 "metadata": {},
257 "outputs": [],
258 "source": [
259 "# RobustScaler: (x - median) / IQR\n",
260 "scaler_robust = RobustScaler()\n",
261 "df_robust = pd.DataFrame(\n",
262 " scaler_robust.fit_transform(df_scale),\n",
263 " columns=df_scale.columns\n",
264 ")\n",
265 "\n",
266 "print(\"RobustScaler ๊ฒฐ๊ณผ:\")\n",
267 "print(df_robust.describe())"
268 ]
269 },
270 {
271 "cell_type": "markdown",
272 "id": "cell-16",
273 "metadata": {},
274 "source": [
275 "### 2.4 스케일러 비교 시각화"
276 ]
277 },
278 {
279 "cell_type": "code",
280 "execution_count": null,
281 "id": "cell-17",
282 "metadata": {},
283 "outputs": [],
284 "source": [
285 "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
286 "axes = axes.flatten()\n",
287 "\n",
288 "# ์ด์์น ์ถ๊ฐ\n",
289 "df_outlier = df_scale.copy()\n",
290 "df_outlier.loc[0, 'income'] = 500000 # ์ด์์น ์ถ๊ฐ\n",
291 "\n",
292 "scalers = [\n",
293 " ('Original', df_outlier),\n",
294 " ('StandardScaler', pd.DataFrame(StandardScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
295 " ('MinMaxScaler', pd.DataFrame(MinMaxScaler().fit_transform(df_outlier), columns=df_outlier.columns)),\n",
296 " ('RobustScaler', pd.DataFrame(RobustScaler().fit_transform(df_outlier), columns=df_outlier.columns))\n",
297 "]\n",
298 "\n",
299 "for ax, (name, data) in zip(axes, scalers):\n",
300 " ax.boxplot([data['age'], data['income'], data['score']], labels=['age', 'income', 'score'])\n",
301 " ax.set_title(name)\n",
302 " ax.set_ylabel('Value')\n",
303 " ax.grid(True, alpha=0.3)\n",
304 "\n",
305 "plt.tight_layout()\n",
306 "plt.show()"
307 ]
308 },
309 {
310 "cell_type": "markdown",
311 "id": "cell-18",
312 "metadata": {},
313 "source": [
314 "## 3. 범주형 변수 인코딩 (Categorical Encoding)"
315 ]
316 },
317 {
318 "cell_type": "code",
319 "execution_count": null,
320 "id": "cell-19",
321 "metadata": {},
322 "outputs": [],
323 "source": [
324 "# ๋ฒ์ฃผํ ๋ฐ์ดํฐ ์ํ\n",
325 "data_cat = {\n",
326 " 'color': ['red', 'blue', 'green', 'red', 'blue', 'green', 'red'],\n",
327 " 'size': ['S', 'M', 'L', 'M', 'S', 'L', 'M'],\n",
328 " 'quality': ['good', 'excellent', 'poor', 'good', 'excellent', 'poor', 'good']\n",
329 "}\n",
330 "df_cat = pd.DataFrame(data_cat)\n",
331 "\n",
332 "print(\"๋ฒ์ฃผํ ๋ฐ์ดํฐ:\")\n",
333 "print(df_cat)"
334 ]
335 },
336 {
337 "cell_type": "markdown",
338 "id": "cell-20",
339 "metadata": {},
340 "source": [
341 "### 3.1 LabelEncoder (레이블 인코딩)"
342 ]
343 },
344 {
345 "cell_type": "code",
346 "execution_count": null,
347 "id": "cell-21",
348 "metadata": {},
349 "outputs": [],
350 "source": [
351 "# LabelEncoder: ๋ฒ์ฃผ๋ฅผ ์ ์๋ก ๋ณํ\n",
352 "le_color = LabelEncoder()\n",
353 "df_cat['color_encoded'] = le_color.fit_transform(df_cat['color'])\n",
354 "\n",
355 "print(\"LabelEncoder ๊ฒฐ๊ณผ:\")\n",
356 "print(df_cat[['color', 'color_encoded']])\n",
357 "print(f\"\\nํด๋์ค: {le_color.classes_}\")\n",
358 "print(f\"๋ณํ: {dict(zip(le_color.classes_, le_color.transform(le_color.classes_)))}\")"
359 ]
360 },
361 {
362 "cell_type": "markdown",
363 "id": "cell-22",
364 "metadata": {},
365 "source": [
366 "### 3.2 OneHotEncoder (원-핫 인코딩)"
367 ]
368 },
369 {
370 "cell_type": "code",
371 "execution_count": null,
372 "id": "cell-23",
373 "metadata": {},
374 "outputs": [],
375 "source": [
376 "# OneHotEncoder: ๋ฒ์ฃผ๋ฅผ ์ด์ง ๋ฒกํฐ๋ก ๋ณํ\n",
377 "ohe = OneHotEncoder(sparse_output=False)\n",
378 "color_onehot = ohe.fit_transform(df_cat[['color']])\n",
379 "\n",
380 "# DataFrame์ผ๋ก ๋ณํ\n",
381 "df_onehot = pd.DataFrame(\n",
382 " color_onehot,\n",
383 " columns=ohe.get_feature_names_out(['color'])\n",
384 ")\n",
385 "\n",
386 "print(\"OneHotEncoder ๊ฒฐ๊ณผ:\")\n",
387 "print(pd.concat([df_cat['color'], df_onehot], axis=1))"
388 ]
389 },
390 {
391 "cell_type": "markdown",
392 "id": "cell-24",
393 "metadata": {},
394 "source": [
395 "### 3.3 OrdinalEncoder (순서형 인코딩)"
396 ]
397 },
398 {
399 "cell_type": "code",
400 "execution_count": null,
401 "id": "cell-25",
402 "metadata": {},
403 "outputs": [],
404 "source": [
405 "# OrdinalEncoder: ์์๊ฐ ์๋ ๋ฒ์ฃผํ ๋ณ์\n",
406 "oe = OrdinalEncoder(categories=[['poor', 'good', 'excellent']])\n",
407 "df_cat['quality_encoded'] = oe.fit_transform(df_cat[['quality']])\n",
408 "\n",
409 "print(\"OrdinalEncoder ๊ฒฐ๊ณผ:\")\n",
410 "print(df_cat[['quality', 'quality_encoded']])\n",
411 "print(f\"\\n์์: poor(0) < good(1) < excellent(2)\")"
412 ]
413 },
414 {
415 "cell_type": "markdown",
416 "id": "cell-26",
417 "metadata": {},
418 "source": [
419 "### 3.4 Pandas get_dummies"
420 ]
421 },
422 {
423 "cell_type": "code",
424 "execution_count": null,
425 "id": "cell-27",
426 "metadata": {},
427 "outputs": [],
428 "source": [
429 "# pandas์ get_dummies (๊ฐํธํ ์-ํซ ์ธ์ฝ๋ฉ)\n",
430 "df_dummies = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'])\n",
431 "\n",
432 "print(\"pd.get_dummies ๊ฒฐ๊ณผ:\")\n",
433 "print(df_dummies.head())\n",
434 "\n",
435 "# drop_first=True๋ก ๋ค์ค๊ณต์ ์ฑ ๋ฐฉ์ง\n",
436 "df_dummies_drop = pd.get_dummies(df_cat[['color', 'size']], prefix=['color', 'size'], drop_first=True)\n",
437 "print(f\"\\ndrop_first=True (shape: {df_dummies_drop.shape}):\")\n",
438 "print(df_dummies_drop.head())"
439 ]
440 },
441 {
442 "cell_type": "markdown",
443 "id": "cell-28",
444 "metadata": {},
445 "source": [
446 "## 4. 특성 선택 (Feature Selection)"
447 ]
448 },
449 {
450 "cell_type": "code",
451 "execution_count": null,
452 "id": "cell-29",
453 "metadata": {},
454 "outputs": [],
455 "source": [
456 "from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif\n",
457 "from sklearn.feature_selection import RFE\n",
458 "from sklearn.ensemble import RandomForestClassifier\n",
459 "\n",
460 "# Iris ๋ฐ์ดํฐ ๋ก๋\n",
461 "iris = load_iris()\n",
462 "X, y = iris.data, iris.target\n",
463 "\n",
464 "print(f\"์๋ณธ ๋ฐ์ดํฐ: {X.shape}\")\n",
465 "print(f\"ํน์ฑ ์ด๋ฆ: {iris.feature_names}\")"
466 ]
467 },
468 {
469 "cell_type": "markdown",
470 "id": "cell-30",
471 "metadata": {},
472 "source": [
473 "### 4.1 SelectKBest (통계적 선택)"
474 ]
475 },
476 {
477 "cell_type": "code",
478 "execution_count": null,
479 "id": "cell-31",
480 "metadata": {},
481 "outputs": [],
482 "source": [
483 "# F-ํต๊ณ๋ ๊ธฐ๋ฐ ์ ํ\n",
484 "selector_f = SelectKBest(score_func=f_classif, k=2)\n",
485 "X_kbest_f = selector_f.fit_transform(X, y)\n",
486 "\n",
487 "# ์ํธ์ ๋ณด๋ ๊ธฐ๋ฐ ์ ํ\n",
488 "selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)\n",
489 "X_kbest_mi = selector_mi.fit_transform(X, y)\n",
490 "\n",
491 "print(\"SelectKBest (F-statistic):\")\n",
492 "scores_f = pd.DataFrame({\n",
493 " 'Feature': iris.feature_names,\n",
494 " 'Score': selector_f.scores_\n",
495 "}).sort_values('Score', ascending=False)\n",
496 "print(scores_f)\n",
497 "\n",
498 "print(\"\\nSelectKBest (Mutual Information):\")\n",
499 "scores_mi = pd.DataFrame({\n",
500 " 'Feature': iris.feature_names,\n",
501 " 'Score': selector_mi.scores_\n",
502 "}).sort_values('Score', ascending=False)\n",
503 "print(scores_mi)"
504 ]
505 },
506 {
507 "cell_type": "markdown",
508 "id": "cell-32",
509 "metadata": {},
510 "source": [
511 "### 4.2 RFE (재귀적 특성 제거)"
512 ]
513 },
514 {
515 "cell_type": "code",
516 "execution_count": null,
517 "id": "cell-33",
518 "metadata": {},
519 "outputs": [],
520 "source": [
521 "# RFE with Random Forest\n",
522 "estimator = RandomForestClassifier(n_estimators=50, random_state=42)\n",
523 "selector_rfe = RFE(estimator, n_features_to_select=2, step=1)\n",
524 "X_rfe = selector_rfe.fit_transform(X, y)\n",
525 "\n",
526 "print(\"RFE ๊ฒฐ๊ณผ:\")\n",
527 "rfe_result = pd.DataFrame({\n",
528 " 'Feature': iris.feature_names,\n",
529 " 'Selected': selector_rfe.support_,\n",
530 " 'Ranking': selector_rfe.ranking_\n",
531 "}).sort_values('Ranking')\n",
532 "print(rfe_result)"
533 ]
534 },
535 {
536 "cell_type": "markdown",
537 "id": "cell-34",
538 "metadata": {},
539 "source": [
540 "### 4.3 특성 중요도 (랜덤 포레스트)"
541 ]
542 },
543 {
544 "cell_type": "code",
545 "execution_count": null,
546 "id": "cell-35",
547 "metadata": {},
548 "outputs": [],
549 "source": [
550 "# ๋๋ค ํฌ๋ ์คํธ ํน์ฑ ์ค์๋\n",
551 "rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
552 "rf.fit(X, y)\n",
553 "\n",
554 "importance = pd.DataFrame({\n",
555 " 'Feature': iris.feature_names,\n",
556 " 'Importance': rf.feature_importances_\n",
557 "}).sort_values('Importance', ascending=True)\n",
558 "\n",
559 "plt.figure(figsize=(10, 6))\n",
560 "plt.barh(importance['Feature'], importance['Importance'])\n",
561 "plt.xlabel('Importance')\n",
562 "plt.title('Random Forest Feature Importance - Iris Dataset')\n",
563 "plt.grid(True, alpha=0.3)\n",
564 "plt.tight_layout()\n",
565 "plt.show()"
566 ]
567 },
568 {
569 "cell_type": "markdown",
570 "id": "cell-36",
571 "metadata": {},
572 "source": [
573 "## 5. 불균형 데이터 처리 (Imbalanced Data)"
574 ]
575 },
576 {
577 "cell_type": "code",
578 "execution_count": null,
579 "id": "cell-37",
580 "metadata": {},
581 "outputs": [],
582 "source": [
583 "from sklearn.datasets import make_classification\n",
584 "\n",
585 "# 불균형 데이터 생성 (9:1 비율)\n",
586 "X_imb, y_imb = make_classification(\n",
587 " n_samples=1000,\n",
588 " n_features=20,\n",
589 " n_informative=15,\n",
590 " n_redundant=5,\n",
591 " n_classes=2,\n",
592 " weights=[0.9, 0.1], # 90% vs 10%\n",
593 " random_state=42\n",
594 ")\n",
595 "\n",
596 "# ํด๋์ค ๋ถํฌ ํ์ธ\n",
597 "unique, counts = np.unique(y_imb, return_counts=True)\n",
598 "print(\"ํด๋์ค ๋ถํฌ:\")\n",
599 "for cls, cnt in zip(unique, counts):\n",
600 " print(f\" Class {cls}: {cnt} ({cnt/len(y_imb)*100:.1f}%)\")\n",
601 "\n",
602 "# ์๊ฐํ\n",
603 "plt.figure(figsize=(8, 5))\n",
604 "plt.bar(['Class 0', 'Class 1'], counts, color=['skyblue', 'salmon'])\n",
605 "plt.ylabel('Count')\n",
606 "plt.title('Imbalanced Dataset Distribution')\n",
607 "plt.grid(True, alpha=0.3)\n",
608 "plt.show()"
609 ]
610 },
611 {
612 "cell_type": "markdown",
613 "id": "cell-38",
614 "metadata": {},
615 "source": [
616 "### 5.1 SMOTE 개념 (이론)"
617 ]
618 },
619 {
620 "cell_type": "code",
621 "execution_count": null,
622 "id": "cell-39",
623 "metadata": {},
624 "outputs": [],
625 "source": [
626 "# SMOTE (Synthetic Minority Over-sampling Technique) 개념 설명\n",
627 "print(\"\"\"\n",
628 "SMOTE 작동 원리:\n",
629 "\n",
630 "1. 소수 클래스의 각 샘플에 대해:\n",
631 " - K개의 최근접 이웃을 찾음 (보통 k=5)\n",
632 " \n",
633 "2. 랜덤하게 선택된 이웃과의 선형 보간:\n",
634 " - new_sample = sample + λ × (neighbor - sample)\n",
635 " - λ는 0과 1 사이의 랜덤값\n",
636 " \n",
637 "3. 합성 샘플을 생성하여 소수 클래스 증가\n",
638 "\n",
639 "장점:\n",
640 "- 과적합 위험이 낮음 (단순 복제가 아님)\n",
641 "- 결정 경계가 더 일반화됨\n",
642 "\n",
643 "단점:\n",
644 "- 노이즈에 민감할 수 있음\n",
645 "- 고차원 데이터에서는 효과가 제한적\n",
646 "\n",
647 "사용 방법:\n",
648 "- pip install imbalanced-learn\n",
649 "- from imblearn.over_sampling import SMOTE\n",
650 "- smote = SMOTE(random_state=42)\n",
651 "- X_resampled, y_resampled = smote.fit_resample(X, y)\n",
652 "\"\"\")"
653 ]
654 },
655 {
656 "cell_type": "markdown",
657 "id": "cell-40",
658 "metadata": {},
659 "source": [
660 "### 5.2 클래스 가중치 조정"
661 ]
662 },
663 {
664 "cell_type": "code",
665 "execution_count": null,
666 "id": "cell-41",
667 "metadata": {},
668 "outputs": [],
669 "source": [
670 "from sklearn.linear_model import LogisticRegression\n",
671 "from sklearn.metrics import classification_report, confusion_matrix\n",
672 "\n",
673 "X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(\n",
674 " X_imb, y_imb, test_size=0.3, random_state=42\n",
675 ")\n",
676 "\n",
677 "# ๊ฐ์ค์น ์์\n",
678 "clf_no_weight = LogisticRegression(random_state=42, max_iter=1000)\n",
679 "clf_no_weight.fit(X_train_imb, y_train_imb)\n",
680 "y_pred_no_weight = clf_no_weight.predict(X_test_imb)\n",
681 "\n",
682 "# ๊ฐ์ค์น ์กฐ์ (balanced)\n",
683 "clf_balanced = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)\n",
684 "clf_balanced.fit(X_train_imb, y_train_imb)\n",
685 "y_pred_balanced = clf_balanced.predict(X_test_imb)\n",
686 "\n",
687 "print(\"=== ๊ฐ์ค์น ์์ ===\")\n",
688 "print(classification_report(y_test_imb, y_pred_no_weight))\n",
689 "\n",
690 "print(\"\\n=== ๊ฐ์ค์น ์กฐ์ (balanced) ===\")\n",
691 "print(classification_report(y_test_imb, y_pred_balanced))"
692 ]
693 },
694 {
695 "cell_type": "markdown",
696 "id": "cell-42",
697 "metadata": {},
698 "source": [
699 "## 6. 실전 전처리 파이프라인"
700 ]
701 },
702 {
703 "cell_type": "code",
704 "execution_count": null,
705 "id": "cell-43",
706 "metadata": {},
707 "outputs": [],
708 "source": [
709 "from sklearn.pipeline import Pipeline\n",
710 "from sklearn.compose import ColumnTransformer\n",
711 "\n",
712 "# ํผํฉ ๋ฐ์ดํฐ ์์ฑ\n",
713 "data_mixed = {\n",
714 " 'age': [25, np.nan, 35, 40, 30, 45, np.nan, 28],\n",
715 " 'income': [50000, 60000, np.nan, 80000, 70000, 90000, 55000, np.nan],\n",
716 " 'city': ['Seoul', 'Busan', 'Seoul', 'Daegu', 'Busan', 'Seoul', 'Daegu', 'Busan'],\n",
717 " 'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],\n",
718 " 'purchased': [0, 1, 1, 0, 1, 1, 0, 1]\n",
719 "}\n",
720 "df_mixed = pd.DataFrame(data_mixed)\n",
721 "\n",
722 "X_mixed = df_mixed.drop('purchased', axis=1)\n",
723 "y_mixed = df_mixed['purchased']\n",
724 "\n",
725 "print(\"ํผํฉ ๋ฐ์ดํฐ:\")\n",
726 "print(df_mixed)"
727 ]
728 },
729 {
730 "cell_type": "code",
731 "execution_count": null,
732 "id": "cell-44",
733 "metadata": {},
734 "outputs": [],
735 "source": [
736 "# ์์นํ/๋ฒ์ฃผํ ํน์ฑ ๋ถ๋ฆฌ\n",
737 "numeric_features = ['age', 'income']\n",
738 "categorical_features = ['city', 'education']\n",
739 "\n",
740 "# ์์นํ ์ ์ฒ๋ฆฌ ํ์ดํ๋ผ์ธ\n",
741 "numeric_transformer = Pipeline(steps=[\n",
742 " ('imputer', SimpleImputer(strategy='median')),\n",
743 " ('scaler', StandardScaler())\n",
744 "])\n",
745 "\n",
746 "# ๋ฒ์ฃผํ ์ ์ฒ๋ฆฌ ํ์ดํ๋ผ์ธ\n",
747 "categorical_transformer = Pipeline(steps=[\n",
748 " ('imputer', SimpleImputer(strategy='most_frequent')),\n",
749 " ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))\n",
750 "])\n",
751 "\n",
752 "# ColumnTransformer๋ก ๊ฒฐํฉ\n",
753 "preprocessor = ColumnTransformer(\n",
754 " transformers=[\n",
755 " ('num', numeric_transformer, numeric_features),\n",
756 " ('cat', categorical_transformer, categorical_features)\n",
757 " ]\n",
758 ")\n",
759 "\n",
760 "# ์ ์ฒด ํ์ดํ๋ผ์ธ\n",
761 "pipeline = Pipeline(steps=[\n",
762 " ('preprocessor', preprocessor),\n",
763 " ('classifier', LogisticRegression(random_state=42))\n",
764 "])\n",
765 "\n",
766 "# ํ์ต (์์ ๋ฐ์ดํฐ์ด๋ฏ๋ก ์ ์ฒด ์ฌ์ฉ)\n",
767 "pipeline.fit(X_mixed, y_mixed)\n",
768 "\n",
769 "# ์๋ก์ด ๋ฐ์ดํฐ ์์ธก\n",
770 "new_data = pd.DataFrame({\n",
771 " 'age': [30],\n",
772 " 'income': [70000],\n",
773 " 'city': ['Seoul'],\n",
774 " 'education': ['Master']\n",
775 "})\n",
776 "\n",
777 "prediction = pipeline.predict(new_data)\n",
778 "probability = pipeline.predict_proba(new_data)\n",
779 "\n",
780 "print(f\"\\n์์ธก ๊ฒฐ๊ณผ: {prediction[0]}\")\n",
781 "print(f\"ํ๋ฅ : {probability[0]}\")"
782 ]
783 },
784 {
785 "cell_type": "markdown",
786 "id": "cell-45",
787 "metadata": {},
788 "source": [
789 "## ์ ๋ฆฌ\n",
790 "\n",
791 "### ํต์ฌ ๊ฐ๋
\n",
792 "\n",
793 "**๊ฒฐ์ธก์น ์ฒ๋ฆฌ:**\n",
794 "- **SimpleImputer**: ํ๊ท , ์ค์๊ฐ, ์ต๋น๊ฐ, ์์๋ก ๋์ฒด\n",
795 "- **KNNImputer**: K-์ต๊ทผ์ ์ด์ ๊ธฐ๋ฐ ๋์ฒด\n",
796 "\n",
797 "**ํน์ฑ ์ค์ผ์ผ๋ง:**\n",
798 "- **StandardScaler**: ํ๊ท 0, ํ์คํธ์ฐจ 1 (์ ๊ท๋ถํฌ ๊ฐ์ )\n",
799 "- **MinMaxScaler**: 0-1 ๋ฒ์๋ก ์ ๊ทํ\n",
800 "- **RobustScaler**: ์ค์๊ฐ๊ณผ IQR ์ฌ์ฉ (์ด์์น์ ๊ฐ๊ฑด)\n",
801 "\n",
802 "**๋ฒ์ฃผํ ์ธ์ฝ๋ฉ:**\n",
803 "- **LabelEncoder**: ์์ ์๋ ๋ถ๋ฅ (ํ๊ฒ ๋ณ์์ฉ)\n",
804 "- **OneHotEncoder**: ์ด์ง ๋ฒกํฐ๋ก ๋ณํ (๋ค์ค๊ณต์ ์ฑ ์ฃผ์)\n",
805 "- **OrdinalEncoder**: ์์๊ฐ ์๋ ๋ฒ์ฃผํ\n",
806 "\n",
807 "**๋ถ๊ท ํ ๋ฐ์ดํฐ:**\n",
808 "- **SMOTE**: ํฉ์ฑ ์ํ ์์ฑ (์๊ตฌ์ฌํญ: imbalanced-learn)\n",
809 "- **class_weight**: ๋ชจ๋ธ ๊ฐ์ค์น ์กฐ์ \n",
810 "\n",
811 "### ๋ค์ ๋จ๊ณ\n",
812 "- Pipeline๊ณผ ColumnTransformer ํ์ฉ\n",
813 "- ๊ต์ฐจ ๊ฒ์ฆ๊ณผ ์ ์ฒ๋ฆฌ ํตํฉ\n",
814 "- ์ค์ ํ๋ก์ ํธ ์ ์ฉ"
815 ]
816 }
817 ],
818 "metadata": {
819 "kernelspec": {
820 "display_name": "Python 3",
821 "language": "python",
822 "name": "python3"
823 },
824 "language_info": {
825 "codemirror_mode": {
826 "name": "ipython",
827 "version": 3
828 },
829 "file_extension": ".py",
830 "mimetype": "text/x-python",
831 "name": "python",
832 "nbconvert_exporter": "python",
833 "pygments_lexer": "ipython3",
834 "version": "3.8.0"
835 }
836 },
837 "nbformat": 4,
838 "nbformat_minor": 5
839}