# 05_data_cleaning.py
"""
๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ (Data Cleaning/Preprocessing)
Data Cleaning and Preprocessing Techniques

์‹ค์ œ ๋ฐ์ดํ„ฐ ๋ถ„์„์—์„œ ๊ฐ€์žฅ ์ค‘์š”ํ•œ ์ „์ฒ˜๋ฆฌ ๊ธฐ๋ฒ•์„ ๋‹ค๋ฃน๋‹ˆ๋‹ค.
"""

import numpy as np
import pandas as pd
from typing import List, Tuple


 13# =============================================================================
 14# 1. ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
 15# =============================================================================
 16def handle_missing_values():
 17    """๊ฒฐ์ธก์น˜ ํƒ์ง€ ๋ฐ ์ฒ˜๋ฆฌ"""
 18    print("\n[1] ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ")
 19    print("=" * 50)
 20
 21    # ๊ฒฐ์ธก์น˜๊ฐ€ ์žˆ๋Š” ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
 22    df = pd.DataFrame({
 23        'A': [1, 2, np.nan, 4, 5],
 24        'B': [np.nan, 2, 3, np.nan, 5],
 25        'C': [1, 2, 3, 4, 5],
 26        'D': ['a', None, 'c', 'd', np.nan]
 27    })
 28
 29    print("์›๋ณธ ๋ฐ์ดํ„ฐ:")
 30    print(df)
 31    print()
 32
 33    # ๊ฒฐ์ธก์น˜ ํƒ์ง€
 34    print("๊ฒฐ์ธก์น˜ ๊ฐœ์ˆ˜:")
 35    print(df.isnull().sum())
 36    print(f"\n๊ฒฐ์ธก์น˜ ๋น„์œจ:\n{df.isnull().mean() * 100}")
 37
 38    # ์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ•๋“ค
 39    print("\n--- ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ• ---")
 40
 41    # 1. ํ–‰ ์‚ญ์ œ
 42    df_dropna = df.dropna()
 43    print(f"\n1. ํ–‰ ์‚ญ์ œ (dropna):\n{df_dropna}")
 44
 45    # 2. ํŠน์ • ์—ด์—์„œ๋งŒ ์‚ญ์ œ
 46    df_drop_subset = df.dropna(subset=['A', 'C'])
 47    print(f"\n2. A, C ์—ด ๊ธฐ์ค€ ์‚ญ์ œ:\n{df_drop_subset}")
 48
 49    # 3. ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ
 50    df_fillna = df.copy()
 51    df_fillna['A'] = df_fillna['A'].fillna(df_fillna['A'].mean())
 52    df_fillna['B'] = df_fillna['B'].fillna(df_fillna['B'].median())
 53    print(f"\n3. ํ‰๊ท /์ค‘์•™๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ:\n{df_fillna}")
 54
 55    # 4. ์ „๋ฐฉ/ํ›„๋ฐฉ ์ฑ„์šฐ๊ธฐ
 56    df_ffill = df.fillna(method='ffill')
 57    print(f"\n4. ์ „๋ฐฉ ์ฑ„์šฐ๊ธฐ (ffill):\n{df_ffill}")
 58
 59    # 5. ๋ณด๊ฐ„๋ฒ•
 60    df_interpolate = df.copy()
 61    df_interpolate['A'] = df_interpolate['A'].interpolate()
 62    df_interpolate['B'] = df_interpolate['B'].interpolate()
 63    print(f"\n5. ๋ณด๊ฐ„๋ฒ• (interpolate):\n{df_interpolate}")
 64
 65
 66# =============================================================================
 67# 2. ์ด์ƒ์น˜ ํƒ์ง€ ๋ฐ ์ฒ˜๋ฆฌ
 68# =============================================================================
 69def handle_outliers():
 70    """์ด์ƒ์น˜ ํƒ์ง€ ๋ฐ ์ฒ˜๋ฆฌ"""
 71    print("\n[2] ์ด์ƒ์น˜ ํƒ์ง€ ๋ฐ ์ฒ˜๋ฆฌ")
 72    print("=" * 50)
 73
 74    np.random.seed(42)
 75
 76    # ์ด์ƒ์น˜๊ฐ€ ํฌํ•จ๋œ ๋ฐ์ดํ„ฐ
 77    normal_data = np.random.normal(100, 10, 100)
 78    outliers = np.array([200, -50, 250])
 79    data = np.concatenate([normal_data, outliers])
 80    np.random.shuffle(data)
 81
 82    df = pd.DataFrame({'value': data})
 83
 84    print(f"๋ฐ์ดํ„ฐ ํฌ๊ธฐ: {len(df)}")
 85    print(f"ํ‰๊ท : {df['value'].mean():.2f}")
 86    print(f"ํ‘œ์ค€ํŽธ์ฐจ: {df['value'].std():.2f}")
 87
 88    # ๋ฐฉ๋ฒ• 1: IQR ๋ฐฉ๋ฒ•
 89    print("\n--- IQR ๋ฐฉ๋ฒ• ---")
 90    Q1 = df['value'].quantile(0.25)
 91    Q3 = df['value'].quantile(0.75)
 92    IQR = Q3 - Q1
 93    lower_bound = Q1 - 1.5 * IQR
 94    upper_bound = Q3 + 1.5 * IQR
 95
 96    print(f"Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
 97    print(f"์ •์ƒ ๋ฒ”์œ„: [{lower_bound:.2f}, {upper_bound:.2f}]")
 98
 99    outliers_iqr = df[(df['value'] < lower_bound) | (df['value'] > upper_bound)]
100    print(f"์ด์ƒ์น˜ ๊ฐœ์ˆ˜: {len(outliers_iqr)}")
101    print(f"์ด์ƒ์น˜ ๊ฐ’: {outliers_iqr['value'].values}")
102
103    # ๋ฐฉ๋ฒ• 2: Z-score ๋ฐฉ๋ฒ•
104    print("\n--- Z-score ๋ฐฉ๋ฒ• ---")
105    z_scores = np.abs((df['value'] - df['value'].mean()) / df['value'].std())
106    outliers_z = df[z_scores > 3]
107    print(f"์ด์ƒ์น˜ ๊ฐœ์ˆ˜ (|z| > 3): {len(outliers_z)}")
108
109    # ์ด์ƒ์น˜ ์ฒ˜๋ฆฌ
110    print("\n--- ์ด์ƒ์น˜ ์ฒ˜๋ฆฌ ---")
111
112    # 1. ์ œ๊ฑฐ
113    df_no_outliers = df[(df['value'] >= lower_bound) & (df['value'] <= upper_bound)]
114    print(f"1. ์ œ๊ฑฐ ํ›„ ํฌ๊ธฐ: {len(df_no_outliers)}")
115
116    # 2. ๊ฒฝ๊ณ„๊ฐ’์œผ๋กœ ๋Œ€์ฒด (Winsorizing)
117    df_winsorized = df.copy()
118    df_winsorized['value'] = df_winsorized['value'].clip(lower_bound, upper_bound)
119    print(f"2. Winsorizing ํ›„ ์ตœ๋Œ€๊ฐ’: {df_winsorized['value'].max():.2f}")
120
121    # 3. ์ค‘์•™๊ฐ’์œผ๋กœ ๋Œ€์ฒด
122    df_median = df.copy()
123    median_val = df['value'].median()
124    df_median.loc[(df['value'] < lower_bound) | (df['value'] > upper_bound), 'value'] = median_val
125    print(f"3. ์ค‘์•™๊ฐ’ ๋Œ€์ฒด ํ›„ ํ‰๊ท : {df_median['value'].mean():.2f}")
126
127
# =============================================================================
# 3. Data type conversion
# =============================================================================
def data_type_conversion():
    """Convert string columns to proper dtypes and compare memory use.

    Casts numeric strings, parses dates, maps booleans, and converts a
    repeated-value column to 'category', then prints the memory saved
    versus plain object dtype. Returns None.
    """
    print("\n[3] ๋ฐ์ดํ„ฐ ํƒ€์ž… ๋ณ€ํ™˜")
    print("=" * 50)

    df = pd.DataFrame({
        'int_col': ['1', '2', '3', '4', '5'],
        'float_col': ['1.1', '2.2', '3.3', '4.4', '5.5'],
        'date_col': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
        'bool_col': ['True', 'False', 'True', 'False', 'True'],
        'cat_col': ['A', 'B', 'A', 'C', 'B']
    })

    print("์›๋ณธ ๋ฐ์ดํ„ฐ ํƒ€์ž…:")
    print(df.dtypes)
    print()

    # Numeric casts in one astype call; date, bool, and category individually.
    df = df.astype({'int_col': int, 'float_col': float})
    df['date_col'] = pd.to_datetime(df['date_col'])
    df['bool_col'] = df['bool_col'].map({'True': True, 'False': False})
    df['cat_col'] = df['cat_col'].astype('category')

    print("๋ณ€ํ™˜ ํ›„ ๋ฐ์ดํ„ฐ ํƒ€์ž…:")
    print(df.dtypes)
    print()

    print("๋ณ€ํ™˜๋œ ๋ฐ์ดํ„ฐ:")
    print(df)

    # Memory comparison: category stores codes + a small categories index.
    print("\n์นดํ…Œ๊ณ ๋ฆฌ ํƒ€์ž… ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ:")
    print(f"  object ํƒ€์ž…: {df['cat_col'].astype('object').memory_usage()} bytes")
    print(f"  category ํƒ€์ž…: {df['cat_col'].memory_usage()} bytes")


# =============================================================================
# 4. Duplicate handling
# =============================================================================
def handle_duplicates():
    """Find and remove duplicate rows, both fully and by key column.

    Reports full-row duplicates, duplicates judged on 'name' only, and
    the frames left after each kind of drop. Returns None.
    """
    print("\n[4] ์ค‘๋ณต ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ")
    print("=" * 50)

    df = pd.DataFrame({
        'name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'David'],
        'age': [25, 30, 25, 35, 30, 40],
        'city': ['Seoul', 'Busan', 'Seoul', 'Daegu', 'Busan', 'Seoul']
    })

    print("์›๋ณธ ๋ฐ์ดํ„ฐ:")
    print(df)

    # Full-row duplicates: every column equal to some earlier row.
    dup_mask = df.duplicated()
    print(f"\n์ค‘๋ณต ํ–‰ ์ˆ˜: {dup_mask.sum()}")
    print("์ค‘๋ณต๋œ ํ–‰:")
    print(df[dup_mask])

    # Duplicates judged on a single key column.
    print(f"\n'name' ๊ธฐ์ค€ ์ค‘๋ณต ์ˆ˜: {df.duplicated(subset=['name']).sum()}")

    # Drop duplicates: whole-row first, then by key keeping first occurrence.
    deduped = df.drop_duplicates()
    print(f"\n์ค‘๋ณต ์ œ๊ฑฐ ํ›„:\n{deduped}")

    deduped_by_name = df.drop_duplicates(subset=['name'], keep='first')
    print(f"\n'name' ๊ธฐ์ค€ ์ค‘๋ณต ์ œ๊ฑฐ (์ฒซ ๋ฒˆ์งธ ์œ ์ง€):\n{deduped_by_name}")


# =============================================================================
# 5. Normalization and standardization
# =============================================================================
def normalization_standardization():
    """Compare Min-Max, Z-score, and robust (IQR-based) feature scaling.

    Applies each scaler column-wise to three synthetic features and
    prints descriptive statistics after each transform. Returns None.
    """
    print("\n[5] ์ •๊ทœํ™”์™€ ํ‘œ์ค€ํ™”")
    print("=" * 50)

    np.random.seed(42)

    df = pd.DataFrame({
        'feature1': np.random.normal(100, 15, 10),
        'feature2': np.random.normal(50, 5, 10),
        'feature3': np.random.exponential(10, 10)
    })

    print("์›๋ณธ ๋ฐ์ดํ„ฐ ํ†ต๊ณ„:")
    print(df.describe().round(2))

    # 1. Min-Max scaling to [0, 1] — vectorized over all columns at once.
    minmax_scaled = (df - df.min()) / (df.max() - df.min())
    print("\n1. Min-Max ์ •๊ทœํ™” (0-1):")
    print(minmax_scaled.describe().round(4))

    # 2. Z-score standardization: zero mean, unit (sample) std per column.
    zscore_scaled = (df - df.mean()) / df.std()
    print("\n2. Z-score ํ‘œ์ค€ํ™”:")
    print(zscore_scaled.describe().round(4))

    # 3. Robust scaling: center on the median, divide by the IQR.
    robust_scaled = (df - df.median()) / (df.quantile(0.75) - df.quantile(0.25))
    print("\n3. Robust ์Šค์ผ€์ผ๋ง (IQR ๊ธฐ๋ฐ˜):")
    print(robust_scaled.describe().round(4))


# =============================================================================
# 6. Categorical encoding
# =============================================================================
def categorical_encoding():
    """Encode categorical columns with label, one-hot, and frequency schemes.

    Demonstrates category codes for a nominal column, an explicit map
    for an ordinal one, pd.get_dummies one-hot expansion, and frequency
    encoding via normalized value counts. Returns None.
    """
    print("\n[6] ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ")
    print("=" * 50)

    df = pd.DataFrame({
        'color': ['red', 'blue', 'green', 'blue', 'red'],
        'size': ['S', 'M', 'L', 'M', 'S'],
        'price': [100, 150, 200, 150, 100]
    })

    print("์›๋ณธ ๋ฐ์ดํ„ฐ:")
    print(df)

    # 1. Label encoding: category codes for nominal, explicit order for ordinal.
    print("\n1. ๋ผ๋ฒจ ์ธ์ฝ”๋”ฉ:")
    labeled = df.copy()
    labeled['color_encoded'] = labeled['color'].astype('category').cat.codes
    labeled['size_encoded'] = labeled['size'].map({'S': 0, 'M': 1, 'L': 2})
    print(labeled)

    # 2. One-hot encoding: one indicator column per level.
    print("\n2. ์›-ํ•ซ ์ธ์ฝ”๋”ฉ:")
    print(pd.get_dummies(df, columns=['color', 'size']))

    # 3. Frequency encoding: map each level to its relative frequency.
    print("\n3. ๋นˆ๋„ ์ธ์ฝ”๋”ฉ:")
    freq_encoded = df.copy()
    freq_encoded['color_freq'] = freq_encoded['color'].map(
        df['color'].value_counts(normalize=True)
    )
    print(freq_encoded)


# =============================================================================
# 7. String processing
# =============================================================================
def string_processing():
    """Normalize whitespace, case, and phone formats in string columns.

    Trims and title-cases names, lower-cases emails, strips non-digits
    from phone numbers, then splits names into first/last parts.
    Returns None.
    """
    print("\n[7] ๋ฌธ์ž์—ด ์ฒ˜๋ฆฌ")
    print("=" * 50)

    df = pd.DataFrame({
        'name': ['  John Doe  ', 'jane smith', 'BOB JONES', 'Alice Brown'],
        'email': ['john@example.com', 'jane@EXAMPLE.COM', 'bob@Example.com', 'alice@example.com'],
        'phone': ['010-1234-5678', '01098765432', '010 1111 2222', '010.3333.4444']
    })

    print("์›๋ณธ ๋ฐ์ดํ„ฐ:")
    print(df)

    # All three clean-ups at once: trim/title names, lower-case emails,
    # keep only digits in phone numbers.
    df_clean = df.assign(
        name=df['name'].str.strip().str.title(),
        email=df['email'].str.lower(),
        phone=df['phone'].str.replace(r'[^0-9]', '', regex=True),
    )

    print("\n์ •๋ฆฌ๋œ ๋ฐ์ดํ„ฐ:")
    print(df_clean)

    # Split the cleaned name once on the first space into two columns.
    print("\n๋ฌธ์ž์—ด ๋ถ„๋ฆฌ:")
    df_clean[['first_name', 'last_name']] = df_clean['name'].str.split(' ', n=1, expand=True)
    print(df_clean[['name', 'first_name', 'last_name']])


326# =============================================================================
327# 8. ๋‚ ์งœ/์‹œ๊ฐ„ ์ฒ˜๋ฆฌ
328# =============================================================================
329def datetime_processing():
330    """๋‚ ์งœ/์‹œ๊ฐ„ ์ฒ˜๋ฆฌ"""
331    print("\n[8] ๋‚ ์งœ/์‹œ๊ฐ„ ์ฒ˜๋ฆฌ")
332    print("=" * 50)
333
334    df = pd.DataFrame({
335        'date_str': ['2024-01-15', '2024/02/20', '15-Mar-2024', '2024.04.10'],
336        'timestamp': pd.date_range('2024-01-01', periods=4, freq='ME'),
337        'value': [100, 150, 120, 180]
338    })
339
340    print("์›๋ณธ ๋ฐ์ดํ„ฐ:")
341    print(df)
342
343    # ๋‚ ์งœ ํŒŒ์‹ฑ
344    df['date_parsed'] = pd.to_datetime(df['date_str'])
345
346    # ๋‚ ์งœ ์š”์†Œ ์ถ”์ถœ
347    df['year'] = df['timestamp'].dt.year
348    df['month'] = df['timestamp'].dt.month
349    df['day'] = df['timestamp'].dt.day
350    df['weekday'] = df['timestamp'].dt.day_name()
351    df['quarter'] = df['timestamp'].dt.quarter
352
353    print("\n๋‚ ์งœ ์š”์†Œ ์ถ”์ถœ:")
354    print(df[['timestamp', 'year', 'month', 'day', 'weekday', 'quarter']])
355
356    # ๋‚ ์งœ ์—ฐ์‚ฐ
357    df['days_since'] = (pd.Timestamp('2024-12-31') - df['timestamp']).dt.days
358
359    print("\n๋‚ ์งœ ์—ฐ์‚ฐ (2024-12-31๊นŒ์ง€ ๋‚จ์€ ์ผ์ˆ˜):")
360    print(df[['timestamp', 'days_since']])
361
362
# =============================================================================
# Main
# =============================================================================
def main():
    """Run every preprocessing demo in order, then print the checklist."""
    print("=" * 60)
    print("๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์˜ˆ์ œ")
    print("=" * 60)

    # Sections run in the same order as they are defined above.
    sections = (
        handle_missing_values,
        handle_outliers,
        data_type_conversion,
        handle_duplicates,
        normalization_standardization,
        categorical_encoding,
        string_processing,
        datetime_processing,
    )
    for section in sections:
        section()

    print("\n" + "=" * 60)
    print("๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ฒดํฌ๋ฆฌ์ŠคํŠธ")
    print("=" * 60)
    print("""
    1. ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ํ™•์ธ
       - head(), info(), describe()
       - shape, dtypes

    2. ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
       - isnull().sum() ์œผ๋กœ ํ™•์ธ
       - ์‚ญ์ œ ๋˜๋Š” ๋Œ€์ฒด (ํ‰๊ท , ์ค‘์•™๊ฐ’, ์ตœ๋นˆ๊ฐ’, ๋ณด๊ฐ„)

    3. ์ด์ƒ์น˜ ์ฒ˜๋ฆฌ
       - IQR ๋˜๋Š” Z-score๋กœ ํƒ์ง€
       - ์ œ๊ฑฐ, ๊ฒฝ๊ณ„๊ฐ’ ๋Œ€์ฒด, ๋˜๋Š” ๋ณ€ํ™˜

    4. ๋ฐ์ดํ„ฐ ํƒ€์ž… ๋ณ€ํ™˜
       - ์ˆซ์ž, ๋‚ ์งœ, ๋ฒ”์ฃผํ˜•์œผ๋กœ ์ ์ ˆํžˆ ๋ณ€ํ™˜
       - category ํƒ€์ž…์œผ๋กœ ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ

    5. ์ค‘๋ณต ์ œ๊ฑฐ
       - duplicated() ํ™•์ธ
       - drop_duplicates()

    6. ์Šค์ผ€์ผ๋ง/์ •๊ทœํ™”
       - Min-Max: ๋ฒ”์œ„๊ฐ€ ์ค‘์š”ํ•  ๋•Œ
       - Z-score: ๋ถ„ํฌ๊ฐ€ ์ค‘์š”ํ•  ๋•Œ
       - Robust: ์ด์ƒ์น˜๊ฐ€ ์žˆ์„ ๋•Œ

    7. ๋ฒ”์ฃผํ˜• ์ธ์ฝ”๋”ฉ
       - ๋ผ๋ฒจ ์ธ์ฝ”๋”ฉ: ์ˆœ์„œ๊ฐ€ ์žˆ๋Š” ๋ณ€์ˆ˜
       - ์›-ํ•ซ ์ธ์ฝ”๋”ฉ: ์ˆœ์„œ๊ฐ€ ์—†๋Š” ๋ณ€์ˆ˜

    8. ๋ฌธ์ž์—ด/๋‚ ์งœ ์ •๋ฆฌ
       - ๊ณต๋ฐฑ ์ œ๊ฑฐ, ๋Œ€์†Œ๋ฌธ์ž ํ†ต์ผ
       - ๋‚ ์งœ ํŒŒ์‹ฑ ๋ฐ ์š”์†Œ ์ถ”์ถœ
    """)


if __name__ == "__main__":
    main()