1"""
2๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ (Data Cleaning/Preprocessing)
3Data Cleaning and Preprocessing Techniques
4
5์ค์ ๋ฐ์ดํฐ ๋ถ์์์ ๊ฐ์ฅ ์ค์ํ ์ ์ฒ๋ฆฌ ๊ธฐ๋ฒ์ ๋ค๋ฃน๋๋ค.
6"""
7
8import numpy as np
9import pandas as pd
10from typing import List, Tuple
11
12
13# =============================================================================
14# 1. ๊ฒฐ์ธก์น ์ฒ๋ฆฌ
15# =============================================================================
def handle_missing_values():
    """Detect missing values and demonstrate common handling strategies.

    Shows: counting/ratio of NaNs, row dropping (whole-frame and by subset),
    statistical imputation (mean/median), forward fill, and interpolation.
    Prints each result; returns None.
    """
    print("\n[1] ๊ฒฐ์ธก์น ์ฒ๋ฆฌ")
    print("=" * 50)

    # Sample frame with NaN/None in both numeric and object columns.
    df = pd.DataFrame({
        'A': [1, 2, np.nan, 4, 5],
        'B': [np.nan, 2, 3, np.nan, 5],
        'C': [1, 2, 3, 4, 5],
        'D': ['a', None, 'c', 'd', np.nan]
    })

    print("์๋ณธ ๋ฐ์ดํฐ:")
    print(df)
    print()

    # Per-column missing count and percentage (mean of the boolean mask).
    print("๊ฒฐ์ธก์น ๊ฐ์:")
    print(df.isnull().sum())
    print(f"\n๊ฒฐ์ธก์น ๋น์จ:\n{df.isnull().mean() * 100}")

    print("\n--- ๊ฒฐ์ธก์น ์ฒ๋ฆฌ ๋ฐฉ๋ฒ ---")

    # 1. Drop every row that has at least one missing value.
    df_dropna = df.dropna()
    print(f"\n1. ํ ์ญ์ (dropna):\n{df_dropna}")

    # 2. Drop rows only when columns A or C are missing.
    df_drop_subset = df.dropna(subset=['A', 'C'])
    print(f"\n2. A, C ์ด ๊ธฐ์ค ์ญ์ :\n{df_drop_subset}")

    # 3. Impute with column statistics (mean for A, median for B).
    df_fillna = df.copy()
    df_fillna['A'] = df_fillna['A'].fillna(df_fillna['A'].mean())
    df_fillna['B'] = df_fillna['B'].fillna(df_fillna['B'].median())
    print(f"\n3. ํ๊ท /์ค์๊ฐ์ผ๋ก ์ฑ์ฐ๊ธฐ:\n{df_fillna}")

    # 4. Forward fill. fillna(method='ffill') was deprecated in pandas 2.1
    # and removed in pandas 3.0 — DataFrame.ffill() is the supported form.
    df_ffill = df.ffill()
    print(f"\n4. ์ ๋ฐฉ ์ฑ์ฐ๊ธฐ (ffill):\n{df_ffill}")

    # 5. Linear interpolation (numeric columns only).
    df_interpolate = df.copy()
    df_interpolate['A'] = df_interpolate['A'].interpolate()
    df_interpolate['B'] = df_interpolate['B'].interpolate()
    print(f"\n5. ๋ณด๊ฐ๋ฒ (interpolate):\n{df_interpolate}")
64
65
66# =============================================================================
67# 2. ์ด์์น ํ์ง ๋ฐ ์ฒ๋ฆฌ
68# =============================================================================
def handle_outliers():
    """Detect outliers with the IQR rule and z-scores, then show three remedies.

    Remedies demonstrated: removal, clipping to the fences (winsorizing),
    and replacement with the median. Prints each result; returns None.
    """
    print("\n[2] ์ด์์น ํ์ง ๋ฐ ์ฒ๋ฆฌ")
    print("=" * 50)

    np.random.seed(42)

    # 100 samples from N(100, 10) plus three planted extremes, shuffled.
    base = np.random.normal(100, 10, 100)
    planted = np.array([200, -50, 250])
    values = np.concatenate([base, planted])
    np.random.shuffle(values)

    df = pd.DataFrame({'value': values})

    print(f"๋ฐ์ดํฐ ํฌ๊ธฐ: {len(df)}")
    print(f"ํ๊ท : {df['value'].mean():.2f}")
    print(f"ํ์คํธ์ฐจ: {df['value'].std():.2f}")

    # Method 1: IQR fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    print("\n--- IQR ๋ฐฉ๋ฒ ---")
    q1, q3 = df['value'].quantile([0.25, 0.75])
    spread = q3 - q1
    lo_fence = q1 - 1.5 * spread
    hi_fence = q3 + 1.5 * spread

    print(f"Q1: {q1:.2f}, Q3: {q3:.2f}, IQR: {spread:.2f}")
    print(f"์ ์ ๋ฒ์: [{lo_fence:.2f}, {hi_fence:.2f}]")

    # Boolean mask of everything outside the fences, reused below.
    outside = (df['value'] < lo_fence) | (df['value'] > hi_fence)
    outliers_iqr = df[outside]
    print(f"์ด์์น ๊ฐ์: {len(outliers_iqr)}")
    print(f"์ด์์น ๊ฐ: {outliers_iqr['value'].values}")

    # Method 2: standardize and flag |z| > 3.
    print("\n--- Z-score ๋ฐฉ๋ฒ ---")
    col = df['value']
    z = (col - col.mean()) / col.std()
    flagged = df[z.abs() > 3]
    print(f"์ด์์น ๊ฐ์ (|z| > 3): {len(flagged)}")

    print("\n--- ์ด์์น ์ฒ๋ฆฌ ---")

    # 1. Drop the flagged rows entirely.
    kept = df[~outside]
    print(f"1. ์ ๊ฑฐ ํ ํฌ๊ธฐ: {len(kept)}")

    # 2. Winsorize: clip values to the fence boundaries.
    winsorized = df.copy()
    winsorized['value'] = winsorized['value'].clip(lo_fence, hi_fence)
    print(f"2. Winsorizing ํ ์ต๋๊ฐ: {winsorized['value'].max():.2f}")

    # 3. Replace flagged values with the (pre-replacement) median.
    replaced = df.copy()
    center = df['value'].median()
    replaced.loc[outside, 'value'] = center
    print(f"3. ์ค์๊ฐ ๋์ฒด ํ ํ๊ท : {replaced['value'].mean():.2f}")
126
127
128# =============================================================================
# 3. 데이터 타입 변환 (Data Type Conversion)
130# =============================================================================
def data_type_conversion():
    """Convert string columns to proper dtypes and compare memory usage.

    Casts to int, float, datetime64, bool (via explicit map), and category,
    then contrasts object vs. category memory footprints. Prints each step;
    returns None.

    NOTE(review): the original literals here were mojibake (UTF-8 read as
    EUC-KR) and contained embedded line-separator characters that break the
    string tokens; they are restored to proper Korean below.
    """
    print("\n[3] 데이터 타입 변환")
    print("=" * 50)

    df = pd.DataFrame({
        'int_col': ['1', '2', '3', '4', '5'],
        'float_col': ['1.1', '2.2', '3.3', '4.4', '5.5'],
        'date_col': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
        'bool_col': ['True', 'False', 'True', 'False', 'True'],
        'cat_col': ['A', 'B', 'A', 'C', 'B']
    })

    print("원본 데이터 타입:")
    print(df.dtypes)
    print()

    # Cast each column to its natural dtype. bool() of any non-empty string
    # is True, so the boolean column is mapped explicitly instead of
    # using astype(bool).
    df['int_col'] = df['int_col'].astype(int)
    df['float_col'] = df['float_col'].astype(float)
    df['date_col'] = pd.to_datetime(df['date_col'])
    df['bool_col'] = df['bool_col'].map({'True': True, 'False': False})
    df['cat_col'] = df['cat_col'].astype('category')

    print("변환 후 데이터 타입:")
    print(df.dtypes)
    print()

    print("변환된 데이터:")
    print(df)

    # category stores small integer codes plus one lookup table, so it is
    # typically much smaller than object for low-cardinality columns.
    print("\n카테고리 타입 메모리 절약:")
    print(f"  object 타입: {df['cat_col'].astype('object').memory_usage()} bytes")
    print(f"  category 타입: {df['cat_col'].memory_usage()} bytes")
166
167
168# =============================================================================
169# 4. ์ค๋ณต ๋ฐ์ดํฐ ์ฒ๋ฆฌ
170# =============================================================================
def handle_duplicates():
    """Find duplicated rows and remove them, whole-row and by key column.

    Shows duplicated() counts/masks and drop_duplicates() both on the full
    row and keyed on 'name'. Prints each result; returns None.
    """
    print("\n[4] ์ค๋ณต ๋ฐ์ดํฐ ์ฒ๋ฆฌ")
    print("=" * 50)

    df = pd.DataFrame({
        'name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'David'],
        'age': [25, 30, 25, 35, 30, 40],
        'city': ['Seoul', 'Busan', 'Seoul', 'Daegu', 'Busan', 'Seoul']
    })

    print("์๋ณธ ๋ฐ์ดํฐ:")
    print(df)

    # Full-row duplicates; the first occurrence is not flagged.
    dup_mask = df.duplicated()
    print(f"\n์ค๋ณต ํ ์: {dup_mask.sum()}")
    print("์ค๋ณต๋ ํ:")
    print(df[dup_mask])

    # Duplicates judged on the 'name' column alone.
    print(f"\n'name' ๊ธฐ์ค ์ค๋ณต ์: {df.duplicated(subset=['name']).sum()}")

    # Remove duplicates: whole row first, then keyed on 'name'
    # keeping each first occurrence.
    deduped = df.drop_duplicates()
    print(f"\n์ค๋ณต ์ ๊ฑฐ ํ:\n{deduped}")

    deduped_by_name = df.drop_duplicates(subset=['name'], keep='first')
    print(f"\n'name' ๊ธฐ์ค ์ค๋ณต ์ ๊ฑฐ (์ฒซ ๋ฒ์งธ ์ ์ง):\n{deduped_by_name}")
199
200
201# =============================================================================
202# 5. ์ ๊ทํ์ ํ์คํ
203# =============================================================================
def normalization_standardization():
    """Scale features three ways: min-max, z-score, and robust (IQR-based).

    Uses whole-frame vectorized arithmetic (pandas aligns the per-column
    statistics automatically), producing the same values as a per-column
    loop. Prints describe() summaries; returns None.
    """
    print("\n[5] ์ ๊ทํ์ ํ์คํ")
    print("=" * 50)

    np.random.seed(42)

    df = pd.DataFrame({
        'feature1': np.random.normal(100, 15, 10),
        'feature2': np.random.normal(50, 5, 10),
        'feature3': np.random.exponential(10, 10)
    })

    print("์๋ณธ ๋ฐ์ดํฐ ํต๊ณ:")
    print(df.describe().round(2))

    # 1. Min-max: rescale every column into [0, 1].
    lo, hi = df.min(), df.max()
    scaled_minmax = (df - lo) / (hi - lo)

    print("\n1. Min-Max ์ ๊ทํ (0-1):")
    print(scaled_minmax.describe().round(4))

    # 2. Z-score: zero mean, unit (sample, ddof=1) standard deviation.
    scaled_z = (df - df.mean()) / df.std()

    print("\n2. Z-score ํ์คํ:")
    print(scaled_z.describe().round(4))

    # 3. Robust: center on the median, scale by the IQR — resistant
    # to outliers.
    iqr = df.quantile(0.75) - df.quantile(0.25)
    scaled_robust = (df - df.median()) / iqr

    print("\n3. Robust ์ค์ผ์ผ๋ง (IQR ๊ธฐ๋ฐ):")
    print(scaled_robust.describe().round(4))
239
240 # 3. Robust ์ค์ผ์ผ๋ง (์ด์์น์ ๊ฐ๊ฑด)
241 df_robust = df.copy()
242 for col in df.columns:
243 median_val = df[col].median()
244 iqr = df[col].quantile(0.75) - df[col].quantile(0.25)
245 df_robust[col] = (df[col] - median_val) / iqr
246
247 print("\n3. Robust ์ค์ผ์ผ๋ง (IQR ๊ธฐ๋ฐ):")
248 print(df_robust.describe().round(4))
249
250
251# =============================================================================
252# 6. ๋ฒ์ฃผํ ๋ณ์ ์ธ์ฝ๋ฉ
253# =============================================================================
def categorical_encoding():
    """Encode categorical columns three ways: label, one-hot, and frequency.

    Label encoding uses category codes (unordered) plus an explicit ordinal
    map; one-hot uses get_dummies; frequency maps each value to its relative
    share of rows. Prints each result; returns None.
    """
    print("\n[6] ๋ฒ์ฃผํ ๋ณ์ ์ธ์ฝ๋ฉ")
    print("=" * 50)

    df = pd.DataFrame({
        'color': ['red', 'blue', 'green', 'blue', 'red'],
        'size': ['S', 'M', 'L', 'M', 'S'],
        'price': [100, 150, 200, 150, 100]
    })

    print("์๋ณธ ๋ฐ์ดํฐ:")
    print(df)

    # 1. Label encoding: integer codes for 'color'; explicit S<M<L order
    # for 'size' since cat.codes would sort alphabetically.
    print("\n1. ๋ผ๋ฒจ ์ธ์ฝ๋ฉ:")
    labeled = df.copy()
    labeled['color_encoded'] = labeled['color'].astype('category').cat.codes
    labeled['size_encoded'] = labeled['size'].map({'S': 0, 'M': 1, 'L': 2})
    print(labeled)

    # 2. One-hot encoding: one indicator column per category value.
    print("\n2. ์-ํซ ์ธ์ฝ๋ฉ:")
    print(pd.get_dummies(df, columns=['color', 'size']))

    # 3. Frequency encoding: relative frequency of each color.
    print("\n3. ๋น๋ ์ธ์ฝ๋ฉ:")
    freq_encoded = df.copy()
    rel_freq = df['color'].value_counts() / len(df)
    freq_encoded['color_freq'] = freq_encoded['color'].map(rel_freq)
    print(freq_encoded)
286
287
288# =============================================================================
289# 7. ๋ฌธ์์ด ์ฒ๋ฆฌ
290# =============================================================================
def string_processing():
    """Normalize messy text columns and split a full name into parts.

    Trims/title-cases names, lower-cases emails, strips non-digits from
    phone numbers, then splits 'name' into first/last columns. Prints each
    stage; returns None.
    """
    print("\n[7] ๋ฌธ์์ด ์ฒ๋ฆฌ")
    print("=" * 50)

    df = pd.DataFrame({
        'name': [' John Doe ', 'jane smith', 'BOB JONES', 'Alice Brown'],
        'email': ['john@example.com', 'jane@EXAMPLE.COM', 'bob@Example.com', 'alice@example.com'],
        'phone': ['010-1234-5678', '01098765432', '010 1111 2222', '010.3333.4444']
    })

    print("์๋ณธ ๋ฐ์ดํฐ:")
    print(df)

    cleaned = df.copy()

    # Drop surrounding whitespace, then Title Case the names.
    cleaned['name'] = cleaned['name'].str.strip().str.title()

    # Email addresses are case-insensitive: normalize to lower case.
    cleaned['email'] = cleaned['email'].str.lower()

    # Keep digits only so every phone number shares one format.
    cleaned['phone'] = cleaned['phone'].str.replace(r'[^0-9]', '', regex=True)

    print("\n์ ๋ฆฌ๋ ๋ฐ์ดํฐ:")
    print(cleaned)

    # Split "First Last" into two columns (at most one split).
    print("\n๋ฌธ์์ด ๋ถ๋ฆฌ:")
    cleaned[['first_name', 'last_name']] = cleaned['name'].str.split(' ', n=1, expand=True)
    print(cleaned[['name', 'first_name', 'last_name']])
324
325
326# =============================================================================
327# 8. ๋ ์ง/์๊ฐ ์ฒ๋ฆฌ
328# =============================================================================
329def datetime_processing():
330 """๋ ์ง/์๊ฐ ์ฒ๋ฆฌ"""
331 print("\n[8] ๋ ์ง/์๊ฐ ์ฒ๋ฆฌ")
332 print("=" * 50)
333
334 df = pd.DataFrame({
335 'date_str': ['2024-01-15', '2024/02/20', '15-Mar-2024', '2024.04.10'],
336 'timestamp': pd.date_range('2024-01-01', periods=4, freq='ME'),
337 'value': [100, 150, 120, 180]
338 })
339
340 print("์๋ณธ ๋ฐ์ดํฐ:")
341 print(df)
342
343 # ๋ ์ง ํ์ฑ
344 df['date_parsed'] = pd.to_datetime(df['date_str'])
345
346 # ๋ ์ง ์์ ์ถ์ถ
347 df['year'] = df['timestamp'].dt.year
348 df['month'] = df['timestamp'].dt.month
349 df['day'] = df['timestamp'].dt.day
350 df['weekday'] = df['timestamp'].dt.day_name()
351 df['quarter'] = df['timestamp'].dt.quarter
352
353 print("\n๋ ์ง ์์ ์ถ์ถ:")
354 print(df[['timestamp', 'year', 'month', 'day', 'weekday', 'quarter']])
355
356 # ๋ ์ง ์ฐ์ฐ
357 df['days_since'] = (pd.Timestamp('2024-12-31') - df['timestamp']).dt.days
358
359 print("\n๋ ์ง ์ฐ์ฐ (2024-12-31๊น์ง ๋จ์ ์ผ์):")
360 print(df[['timestamp', 'days_since']])
361
362
363# =============================================================================
364# ๋ฉ์ธ
365# =============================================================================
def main():
    """Run every preprocessing demo in order, then print a summary checklist."""
    print("=" * 60)
    print("๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์์ ")
    print("=" * 60)

    # Each demo is self-contained: builds its own sample frame and prints
    # its results; none of them return data or share state.
    handle_missing_values()
    handle_outliers()
    data_type_conversion()
    handle_duplicates()
    normalization_standardization()
    categorical_encoding()
    string_processing()
    datetime_processing()

    # Closing checklist. NOTE(review): the text below is mojibake
    # (UTF-8 read as EUC-KR) and is left byte-for-byte as found, since it is
    # runtime output, not a comment.
    print("\n" + "=" * 60)
    print("๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ฒดํฌ๋ฆฌ์คํธ")
    print("=" * 60)
    print("""
    1. ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ํ์ธ
       - head(), info(), describe()
       - shape, dtypes

    2. ๊ฒฐ์ธก์น ์ฒ๋ฆฌ
       - isnull().sum() ์ผ๋ก ํ์ธ
       - ์ญ์ ๋๋ ๋์ฒด (ํ๊ท , ์ค์๊ฐ, ์ต๋น๊ฐ, ๋ณด๊ฐ)

    3. ์ด์์น ์ฒ๋ฆฌ
       - IQR ๋๋ Z-score๋ก ํ์ง
       - ์ ๊ฑฐ, ๊ฒฝ๊ณ๊ฐ ๋์ฒด, ๋๋ ๋ณํ

    4. ๋ฐ์ดํฐ ํ์
๋ณํ
       - ์ซ์, ๋ ์ง, ๋ฒ์ฃผํ์ผ๋ก ์ ์ ํ ๋ณํ
       - category ํ์
์ผ๋ก ๋ฉ๋ชจ๋ฆฌ ์ ์ฝ

    5. ์ค๋ณต ์ ๊ฑฐ
       - duplicated() ํ์ธ
       - drop_duplicates()

    6. ์ค์ผ์ผ๋ง/์ ๊ทํ
       - Min-Max: ๋ฒ์๊ฐ ์ค์ํ ๋
       - Z-score: ๋ถํฌ๊ฐ ์ค์ํ ๋
       - Robust: ์ด์์น๊ฐ ์์ ๋

    7. ๋ฒ์ฃผํ ์ธ์ฝ๋ฉ
       - ๋ผ๋ฒจ ์ธ์ฝ๋ฉ: ์์๊ฐ ์๋ ๋ณ์
       - ์-ํซ ์ธ์ฝ๋ฉ: ์์๊ฐ ์๋ ๋ณ์

    8. ๋ฌธ์์ด/๋ ์ง ์ ๋ฆฌ
       - ๊ณต๋ฐฑ ์ ๊ฑฐ, ๋์๋ฌธ์ ํต์ผ
       - ๋ ์ง ํ์ฑ ๋ฐ ์์ ์ถ์ถ
    """)
417
418
# Script entry point: run all preprocessing demos when executed directly.
if __name__ == "__main__":
    main()