6. Data Preprocessing
6. Data Preprocessing¶
Previous: Pandas Advanced | Next: Descriptive Stats & EDA
Overview¶
Data preprocessing is the process of cleaning and transforming data before analysis or modeling. This covers core techniques including handling missing values, outlier detection, normalization, and encoding.
1. Missing Value Handling¶
1.1 Checking Missing Values¶
# Build a toy frame with NaN/None in both numeric and object columns.
import pandas as pd
import numpy as np
df = pd.DataFrame({
'A': [1, 2, None, 4, 5],
'B': [None, 2, 3, None, 5],
'C': ['a', 'b', None, 'd', 'e'],
'D': [1.0, 2.0, 3.0, 4.0, 5.0]
})
# Inspect missing values
print(df.isna()) # boolean mask of missing cells
print(df.isna().sum()) # missing count per column
print(df.isna().sum().sum()) # total missing count
# Percentage of missing values per column
print(df.isna().mean() * 100)
# Rows/columns that contain missing values
print(df[df.isna().any(axis=1)]) # rows with at least one NaN
print(df.columns[df.isna().any()]) # columns with at least one NaN
# Missing-value visualization (requires the missingno library)
# import missingno as msno
# msno.matrix(df)
1.2 Removing Missing Values¶
# Fresh frame for the dropna() demos.
df = pd.DataFrame({
'A': [1, 2, None, 4],
'B': [None, 2, 3, 4],
'C': [1, None, 3, None]
})
# Drop rows containing any missing value
print(df.dropna())
# Drop rows where every value is missing
print(df.dropna(how='all'))
# Drop based on specific columns only
print(df.dropna(subset=['A']))
print(df.dropna(subset=['A', 'B']))
# Threshold (minimum number of non-missing values required to keep a row)
print(df.dropna(thresh=2)) # at least 2 non-missing values
1.3 Imputing Missing Values¶
# fillna() / interpolate() imputation demos.
df = pd.DataFrame({
'numeric': [1, 2, None, 4, 5, None],
'category': ['A', 'B', None, 'A', 'B', 'A']
})
# Fill with a constant
df_filled = df.fillna(0)
df_filled = df.fillna({'numeric': 0, 'category': 'Unknown'})
# Forward/backward fill.
# df.fillna(method='ffill'/'bfill') is deprecated since pandas 2.1 --
# use the dedicated ffill()/bfill() methods instead.
df_ffill = df.ffill() # propagate previous value
df_bfill = df.bfill() # propagate next value (trailing NaN stays NaN)
# Interpolation (run on the raw column, which still contains NaN)
num_linear = df['numeric'].interpolate(method='linear')
num_poly = df['numeric'].interpolate(method='polynomial', order=2) # needs scipy
# Fill with a statistic; done last so the demos above all saw the NaNs
# (in the original, every later fill was a no-op on already-imputed data)
num_mean = df['numeric'].fillna(df['numeric'].mean()) # mean, on a copy
df['numeric'] = df['numeric'].fillna(df['numeric'].median()) # median
df['category'] = df['category'].fillna(df['category'].mode()[0]) # mode
1.4 Group-wise Missing Value Imputation¶
# Group-wise imputation: fill each missing value with its group's mean.
df = pd.DataFrame({
'group': ['A', 'A', 'B', 'B', 'A', 'B'],
'value': [1, None, 3, None, 5, 6]
})
# transform('mean') broadcasts each group's (NaN-skipping) mean back onto
# the original rows; fillna uses it only where 'value' is missing.
group_means = df.groupby('group')['value'].transform('mean')
df['value'] = df['value'].fillna(group_means)
print(df)
2. Outlier Detection¶
2.1 Statistical Methods¶
# Outlier detection with simple statistics on a small sample (100 is the outlier).
df = pd.DataFrame({
'value': [10, 12, 11, 13, 100, 11, 12, 10, 9, 11]
})
# IQR method: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df['value'] < lower_bound) | (df['value'] > upper_bound)]
print("์ด์์น:", outliers)
# Z-score method
from scipy import stats
z_scores = np.abs(stats.zscore(df['value']))
outliers = df[z_scores > 3] # rows with |z| > 3
print("์ด์์น:", outliers)
# Modified Z-score (MAD-based; 0.6745 rescales MAD to a normal's stdev)
# NOTE(review): a near-constant column gives mad == 0 and a division by zero here.
median = df['value'].median()
mad = np.median(np.abs(df['value'] - median))
modified_z = 0.6745 * (df['value'] - median) / mad
outliers = df[np.abs(modified_z) > 3.5]
2.2 Visual Methods¶
# Visual outlier inspection: box plot and histogram side by side.
import matplotlib.pyplot as plt
df = pd.DataFrame({
'value': np.concatenate([np.random.randn(100), [10, -10]])
})
# Box plot
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].boxplot(df['value'])
axes[0].set_title('Box Plot')
# Histogram
axes[1].hist(df['value'], bins=30, edgecolor='black')
axes[1].set_title('Histogram')
plt.tight_layout()
plt.show()
2.3 Handling Outliers¶
# Four ways to deal with detected outliers.
df = pd.DataFrame({
'value': [10, 12, 11, 13, 100, 11, 12, 10, 9, -50]
})
# 1. Removal: keep only rows inside the 1.5*IQR fences
Q1, Q3 = df['value'].quantile([0.25, 0.75])
IQR = Q3 - Q1
df_clean = df[(df['value'] >= Q1 - 1.5 * IQR) &
(df['value'] <= Q3 + 1.5 * IQR)]
# 2. Replacement: clip to the 5th/95th percentiles
lower = df['value'].quantile(0.05)
upper = df['value'].quantile(0.95)
df['value_clipped'] = df['value'].clip(lower, upper)
# 3. Winsorizing (caps the extreme 5% on each tail)
from scipy.stats import mstats
df['value_winsorized'] = mstats.winsorize(df['value'], limits=[0.05, 0.05])
# 4. Log transform for skewed data: shift so the minimum maps to 0.
# log1p(x) == log(1 + x) already adds the 1, so the original's extra
# "+ 1" double-shifted the data (min mapped to log 2 instead of 0).
df['value_log'] = np.log1p(df['value'] - df['value'].min())
3. Normalization & Standardization¶
3.1 Min-Max Normalization¶
# Min-max normalization scales each column to the [0, 1] range.
df = pd.DataFrame({
'A': [10, 20, 30, 40, 50],
'B': [100, 200, 300, 400, 500]
})
# Manual implementation
df_normalized = (df - df.min()) / (df.max() - df.min())
print(df_normalized)
# Using sklearn
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
scaler.fit_transform(df),
columns=df.columns
)
print(df_normalized)
3.2 Standardization (Z-score)¶
# Z-score standardization: zero mean, unit variance per column.
df = pd.DataFrame({
'A': [10, 20, 30, 40, 50],
'B': [100, 200, 300, 400, 500]
})
# Manual implementation (pandas .std() uses ddof=1 while sklearn's
# StandardScaler uses ddof=0, so the two results differ slightly)
df_standardized = (df - df.mean()) / df.std()
print(df_standardized)
# Using sklearn
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_standardized = pd.DataFrame(
scaler.fit_transform(df),
columns=df.columns
)
print(df_standardized)
3.3 Other Scaling Methods¶
# A tour of the other sklearn scalers on data with an outlier.
from sklearn.preprocessing import (
MinMaxScaler, StandardScaler, RobustScaler,
MaxAbsScaler, QuantileTransformer, PowerTransformer
)
df = pd.DataFrame({
'value': [1, 2, 3, 4, 5, 100] # contains an outlier
})
# RobustScaler: robust to outliers (uses the median and IQR)
scaler = RobustScaler()
robust_scaled = scaler.fit_transform(df)
# MaxAbsScaler: scales by the maximum absolute value
scaler = MaxAbsScaler()
maxabs_scaled = scaler.fit_transform(df)
# QuantileTransformer: quantile-based mapping to a normal distribution.
# n_quantiles must not exceed the sample count -- the default of 1000
# triggers a warning and is silently clipped for this 6-row frame.
scaler = QuantileTransformer(output_distribution='normal', n_quantiles=len(df))
quantile_scaled = scaler.fit_transform(df)
# PowerTransformer: pushes the distribution toward Gaussian
scaler = PowerTransformer(method='yeo-johnson')
power_scaled = scaler.fit_transform(df)
4. Categorical Variable Encoding¶
4.1 Label Encoding¶
# Label encoding maps each category to an integer code.
from sklearn.preprocessing import LabelEncoder
df = pd.DataFrame({
'color': ['red', 'blue', 'green', 'red', 'blue']
})
# sklearn LabelEncoder (codes follow sorted class order)
le = LabelEncoder()
df['color_encoded'] = le.fit_transform(df['color'])
print(df)
print("ํด๋์ค:", le.classes_)
# Inverse transform back to the original labels
original = le.inverse_transform(df['color_encoded'])
print("์๋ณธ:", original)
# pandas factorize (codes follow order of first appearance instead)
codes, uniques = pd.factorize(df['color'])
df['color_factorized'] = codes
print(df)
4.2 One-Hot Encoding¶
# One-hot encoding: one indicator column per category level.
df = pd.DataFrame({
'color': ['red', 'blue', 'green', 'red'],
'size': ['S', 'M', 'L', 'M']
})
# pandas get_dummies
df_encoded = pd.get_dummies(df, columns=['color', 'size'])
print(df_encoded)
# drop_first option (drops one level per column to avoid multicollinearity).
# In the original this comment was split across two lines, leaving a bare
# non-comment line that was a SyntaxError.
df_encoded = pd.get_dummies(df, columns=['color'], drop_first=True)
print(df_encoded)
# sklearn OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded = encoder.fit_transform(df[['color', 'size']])
print(encoded)
print("ํน์ฑ ์ด๋ฆ:", encoder.get_feature_names_out())
4.3 Ordinal Encoding¶
# Ordinal encoding preserves a meaningful category order.
from sklearn.preprocessing import OrdinalEncoder
df = pd.DataFrame({
'education': ['high school', 'bachelor', 'master', 'phd', 'bachelor']
})
# Explicit category order, lowest to highest
order = ['high school', 'bachelor', 'master', 'phd']
# sklearn OrdinalEncoder
encoder = OrdinalEncoder(categories=[order])
df['education_encoded'] = encoder.fit_transform(df[['education']])
print(df)
# pandas Categorical (codes follow the declared order; unseen values -> -1)
df['education_cat'] = pd.Categorical(
df['education'],
categories=order,
ordered=True
)
df['education_codes'] = df['education_cat'].cat.codes
print(df)
4.4 Frequency Encoding¶
# Frequency encoding: replace each category with its relative frequency.
df = pd.DataFrame({
'category': ['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C']
})
# value_counts(normalize=True) is equivalent to value_counts() / len(df)
freq_map = df['category'].value_counts(normalize=True)
df['category_freq'] = df['category'].map(freq_map)
print(df)
4.5 Target Encoding¶
# Target encoding: replace each category with a statistic of the target.
df = pd.DataFrame({
'category': ['A', 'B', 'A', 'C', 'B', 'A'],
'target': [1, 0, 1, 0, 1, 1]
})
# Per-category target mean. (The original's comment was split across two
# lines, leaving a bare garbled line that was a SyntaxError; the function
# body below had also lost its indentation.)
target_mean = df.groupby('category')['target'].mean()
df['category_target_encoded'] = df['category'].map(target_mean)
print(df)
# Smoothing to reduce overfitting on rare categories
def target_encode_smoothed(df, col, target, weight=10):
    """Blend each category's target mean with the global target mean.

    weight controls the shrinkage: larger weight pulls small categories
    more strongly toward the global mean.
    """
    global_mean = df[target].mean()
    agg = df.groupby(col)[target].agg(['mean', 'count'])
    smoothed = (agg['count'] * agg['mean'] + weight * global_mean) / (agg['count'] + weight)
    return df[col].map(smoothed)
df['category_smoothed'] = target_encode_smoothed(df, 'category', 'target')
5. Numeric Transformations¶
5.1 Log Transform¶
# Log transforms compress large-magnitude, right-skewed values.
df = pd.DataFrame({
'value': [1, 10, 100, 1000, 10000]
})
# Log transforms
df['log'] = np.log(df['value'])
df['log10'] = np.log10(df['value'])
df['log1p'] = np.log1p(df['value']) # log(1 + x), safe for x == 0
print(df)
5.2 Box-Cox / Yeo-Johnson Transform¶
# Power transforms pull a skewed distribution toward Gaussian.
from scipy import stats
from sklearn.preprocessing import PowerTransformer
df = pd.DataFrame({
'value': [1, 2, 5, 10, 50, 100, 500]
})
# Box-Cox (requires strictly positive values)
df['boxcox'], lambda_param = stats.boxcox(df['value'])
print(f"์ต์ ๋๋ค: {lambda_param}")
# Yeo-Johnson (also handles zero/negative values)
pt = PowerTransformer(method='yeo-johnson')
# fit_transform returns an (n, 1) array; flatten it before assigning to a
# single column -- modern pandas rejects 2-D values in column assignment.
df['yeojohnson'] = pt.fit_transform(df[['value']]).ravel()
print(df)
5.3 Binning¶
# Binning converts a continuous column into ordered categories.
df = pd.DataFrame({
'age': [15, 22, 35, 45, 55, 65, 75, 85]
})
# Equal-width binning
df['age_bin_equal'] = pd.cut(df['age'], bins=4)
# Custom bin edges and labels
bins = [0, 20, 40, 60, 100]
labels = ['youth', 'adult', 'middle', 'senior']
df['age_bin_custom'] = pd.cut(df['age'], bins=bins, labels=labels)
# Equal-frequency binning (quartiles)
df['age_qcut'] = pd.qcut(df['age'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print(df)
6. Date/Time Handling¶
6.1 Date Parsing¶
# Date parsing: the sample column mixes four different formats.
df = pd.DataFrame({
'date_str': ['2023-01-15', '2023/02/20', '15-03-2023', '04.25.2023']
})
# Automatic parsing of the first two entries. pandas >= 2.0 needs
# format='mixed' here because the two elements use different formats
# (a single inferred format raises); rows 2-3 get NaT via index alignment.
df['date1'] = pd.to_datetime(df['date_str'].iloc[0:2], format='mixed')
# Per-element format inference for the whole column. dayfirst=True is
# dropped: '04.25.2023' is month-first and would fail under it, while
# '15-03-2023' still parses day-first because 15 cannot be a month.
df['date'] = pd.to_datetime(df['date_str'], format='mixed')
# Error handling: unparseable entries become NaT instead of raising
df['date'] = pd.to_datetime(df['date_str'], format='mixed', errors='coerce')
6.2 Date Feature Extraction¶
# Feature engineering from a datetime column.
df = pd.DataFrame({
'timestamp': pd.date_range('2023-01-01', periods=100, freq='D')
})
# Basic calendar features
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['dayofweek'] = df['timestamp'].dt.dayofweek # 0 = Monday
df['dayofyear'] = df['timestamp'].dt.dayofyear
df['weekofyear'] = df['timestamp'].dt.isocalendar().week
df['quarter'] = df['timestamp'].dt.quarter
# Boolean features
df['is_weekend'] = df['timestamp'].dt.dayofweek >= 5
df['is_month_start'] = df['timestamp'].dt.is_month_start
df['is_month_end'] = df['timestamp'].dt.is_month_end
# Cyclical features (sine/cosine encoding keeps December adjacent to January)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
print(df.head())
7. Text Preprocessing¶
7.1 Basic Cleaning¶
# Basic text cleaning with vectorized .str methods (None propagates as NaN).
df = pd.DataFrame({
'text': [' Hello, World! ', 'PYTHON 3.9', 'data-science', None]
})
# Lowercase
df['lower'] = df['text'].str.lower()
# Strip surrounding whitespace
df['stripped'] = df['text'].str.strip()
# Remove punctuation/special characters
df['cleaned'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
# Remove digits
df['no_numbers'] = df['text'].str.replace(r'\d+', '', regex=True)
print(df)
7.2 Tokenization and Stopword Removal¶
import re
# Simple whitespace tokenization
df = pd.DataFrame({
'text': ['This is a sample text.', 'Another example here.']
})
df['tokens'] = df['text'].str.lower().str.split()
# Stopword removal via a named helper (same contract as the original lambda)
stopwords = {'a', 'an', 'the', 'is', 'this', 'here'}
def _drop_stopwords(tokens):
    """Return tokens minus stopwords; an empty/falsy token list yields []."""
    if not tokens:
        return []
    return [tok for tok in tokens if tok not in stopwords]
df['filtered'] = df['tokens'].apply(_drop_stopwords)
print(df)
8. Preprocessing Pipelines¶
8.1 sklearn Pipeline¶
# End-to-end preprocessing with sklearn Pipeline + ColumnTransformer.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Example data
df = pd.DataFrame({
'age': [25, None, 35, 45, None],
'salary': [50000, 60000, None, 80000, 70000],
'department': ['IT', 'HR', 'IT', None, 'Sales']
})
# Numeric pipeline: impute with the median, then standardize
numeric_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
# Categorical pipeline: impute with the mode, then one-hot encode
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
# Full preprocessor: route each column group to the matching pipeline
preprocessor = ColumnTransformer([
('numeric', numeric_pipeline, ['age', 'salary']),
('categorical', categorical_pipeline, ['department'])
])
# Run the transformation
X_transformed = preprocessor.fit_transform(df)
print(X_transformed)
8.2 Custom Transformers¶
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Clip each feature to the IQR fences learned in fit()."""

    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        """Learn per-column clipping bounds [Q1 - f*IQR, Q3 + f*IQR]."""
        q1, q3 = np.quantile(X, [0.25, 0.75], axis=0)
        margin = self.factor * (q3 - q1)
        self.lower_ = q1 - margin
        self.upper_ = q3 + margin
        return self

    def transform(self, X):
        """Clip X to the bounds learned in fit()."""
        return np.clip(X, self.lower_, self.upper_)

# Usage
clipper = OutlierRemover(factor=1.5)
data = np.array([[1], [2], [3], [100], [4], [5]])
transformed = clipper.fit_transform(data)
print(transformed)
Practice Problems¶
Problem 1: Missing Value Handling¶
Handle the missing values in the data below appropriately.
df = pd.DataFrame({
'A': [1, 2, None, 4, 5],
'B': [None, 'X', 'Y', 'X', None]
})
# Solution: median for the numeric column, mode for the categorical one
df['A'] = df['A'].fillna(df['A'].median())
df['B'] = df['B'].fillna(df['B'].mode()[0])
print(df)
Problem 2: Outlier Detection¶
Find and remove the outliers using the IQR method.
df = pd.DataFrame({
'value': [10, 12, 11, 13, 100, 11, 12, 10]
})
# Solution: keep only rows inside the 1.5*IQR fences
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
df_clean = df[(df['value'] >= Q1 - 1.5 * IQR) &
(df['value'] <= Q3 + 1.5 * IQR)]
print(df_clean)
Problem 3: Encoding¶
One-hot encode the categorical variable.
df = pd.DataFrame({
'color': ['red', 'blue', 'green', 'red']
})
# Solution: one indicator column per color value
df_encoded = pd.get_dummies(df, columns=['color'], prefix='color')
print(df_encoded)
Summary¶
| Task | Methods |
|---|---|
| Missing-value detection | isna(), isnull() |
| Missing-value handling | dropna(), fillna(), interpolate() |
| Outlier detection | IQR, Z-score, box plot |
| Normalization/standardization | MinMaxScaler, StandardScaler, RobustScaler |
| Categorical encoding | LabelEncoder, OneHotEncoder, get_dummies() |
| Numeric transforms | log transform, Box-Cox, binning |
| Date handling | to_datetime(), dt accessor |