7. Descriptive Stats & EDA

7. Descriptive Stats & EDA

Previous: Data Preprocessing | Next: Data Visualization Basics

Overview

Descriptive statistics summarize data characteristics, and EDA (Exploratory Data Analysis) is the process of visualizing and exploring data to discover patterns and insights.


1. Descriptive Statistics

1.1 Measures of Central Tendency

import pandas as pd
import numpy as np
from scipy import stats

# Measures of central tendency on a small sample containing an outlier.
data = [10, 15, 20, 25, 30, 35, 40, 100]  # 100 is an outlier
s = pd.Series(data)

# Mean: sensitive to the outlier.
print(f"Mean: {s.mean():.2f}")  # 34.38

# Median: robust to the outlier.
print(f"Median: {s.median():.2f}")  # 27.50

# Mode: the most frequent value(s); may return several.
data_with_mode = [1, 2, 2, 3, 3, 3, 4, 4]
print(f"Mode: {pd.Series(data_with_mode).mode().values}")  # [3]

# Trimmed mean: discards int(0.1 * n) values from each end before
# averaging, which reduces outlier impact (here n=8, so nothing is cut).
print(f"Trimmed mean (10%): {stats.trim_mean(data, 0.1):.2f}")

# Weighted mean: weights align positionally with values.
values = [10, 20, 30]
weights = [0.2, 0.3, 0.5]
weighted_mean = np.average(values, weights=weights)
print(f"Weighted mean: {weighted_mean}")

1.2 Measures of Dispersion

# Measures of dispersion (spread).
s = pd.Series([10, 15, 20, 25, 30, 35, 40])

# Range: max minus min.
print(f"Range: {s.max() - s.min()}")

# Variance: pandas defaults to the sample estimator (ddof=1).
print(f"Variance (sample): {s.var():.2f}")
print(f"Variance (population): {s.var(ddof=0):.2f}")

# Standard deviation.
print(f"Std dev (sample): {s.std():.2f}")
print(f"Std dev (population): {s.std(ddof=0):.2f}")

# Interquartile range (IQR): spread of the middle 50% of the data.
Q1 = s.quantile(0.25)
Q3 = s.quantile(0.75)
IQR = Q3 - Q1
print(f"IQR: {IQR}")

# Coefficient of variation (CV): scale-free relative dispersion.
cv = s.std() / s.mean()
print(f"CV: {cv:.4f}")

# Mean absolute deviation (MAD) around the mean.
mad = (s - s.mean()).abs().mean()
print(f"MAD: {mad:.2f}")

1.3 Measures of Distribution Shape

# Measures of distribution shape.
s = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 5, 10])

# Skewness: degree of asymmetry.
print(f"Skewness: {s.skew():.4f}")
# > 0: right tail (positive skew)
# < 0: left tail (negative skew)
# = 0: symmetric

# Kurtosis: pandas reports EXCESS kurtosis (normal distribution = 0).
print(f"Kurtosis: {s.kurtosis():.4f}")
# > 0: more peaked than normal (heavy tails)
# < 0: flatter than normal (light tails)
# = 0: similar to normal

# scipy versions (Fisher definition; biased estimators by default, so the
# values differ slightly from the pandas bias-corrected ones).
print(f"Skewness (scipy): {stats.skew(s):.4f}")
print(f"Kurtosis (scipy): {stats.kurtosis(s):.4f}")

1.4 Percentiles and Quantiles

# Percentiles and quantiles on the integers 1..100.
s = pd.Series(range(1, 101))

# Individual percentiles (pandas interpolates linearly by default).
print(f"25th percentile: {s.quantile(0.25)}")
print(f"50th percentile: {s.quantile(0.50)}")
print(f"75th percentile: {s.quantile(0.75)}")
print(f"90th percentile: {s.quantile(0.90)}")

# Several quantiles at once.
print(s.quantile([0.1, 0.25, 0.5, 0.75, 0.9]))

# Five-number summary.
print("Five-number summary:")
print(f"Min: {s.min()}")
print(f"Q1: {s.quantile(0.25)}")
print(f"Median: {s.median()}")
print(f"Q3: {s.quantile(0.75)}")
print(f"Max: {s.max()}")

2. DataFrame Descriptive Statistics

2.1 The describe Method

# Demonstrate DataFrame.describe on mixed numeric / categorical columns.
records = {
    'age': [25, 30, 35, 40, 45, 50, 55],
    'salary': [40000, 45000, 50000, 60000, 70000, 80000, 100000],
    'department': ['IT', 'HR', 'IT', 'Sales', 'IT', 'HR', 'Sales'],
}
df = pd.DataFrame(records)

# Numeric columns only (the default).
print(df.describe())

# Every column, categorical ones included.
print(df.describe(include='all'))

# Request specific percentiles.
wanted = [0.1, 0.25, 0.5, 0.75, 0.9]
print(df.describe(percentiles=wanted))

2.2 Correlation Analysis

# Correlation analysis between numeric columns.
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [2, 4, 5, 4, 5],
    'C': [5, 4, 3, 2, 1]
})

# Pearson correlation (linear relationships).
print("Pearson correlation:")
print(df.corr())

# Spearman correlation (rank-based; captures monotonic, nonlinear relationships).
print("\nSpearman correlation:")
print(df.corr(method='spearman'))

# Kendall correlation (rank-based).
print("\nKendall correlation:")
print(df.corr(method='kendall'))

# Correlation between two specific columns.
print(f"\nCorrelation of A and B: {df['A'].corr(df['B']):.4f}")

# With a significance p-value (scipy).
from scipy.stats import pearsonr, spearmanr
corr, p_value = pearsonr(df['A'], df['B'])
print(f"Correlation: {corr:.4f}, p-value: {p_value:.4f}")

2.3 Covariance

# Covariance between two variables.
df = pd.DataFrame({
    'X': [1, 2, 3, 4, 5],
    'Y': [2, 4, 5, 4, 5]
})

# Covariance matrix (sample covariance, ddof=1).
print(df.cov())

# Covariance between two specific columns.
print(f"Covariance of X and Y: {df['X'].cov(df['Y']):.4f}")

3. Exploratory Data Analysis (EDA)

3.1 Data Overview

import pandas as pd
import numpy as np

# Build a synthetic dataset for the EDA walkthrough.
np.random.seed(42)
df = pd.DataFrame({
    'id': range(1, 1001),
    'age': np.random.randint(18, 70, 1000),
    'income': np.random.lognormal(10, 1, 1000),
    'gender': np.random.choice(['M', 'F'], 1000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], 1000),
    'score': np.random.normal(75, 15, 1000)
})
# Inject exactly 50 missing incomes. replace=False guarantees 50 distinct
# rows; sampling with replacement could hit the same row twice and leave
# fewer than 50 NaNs.
df.loc[np.random.choice(1000, 50, replace=False), 'income'] = np.nan

# 1. Basic information
print("="*50)
print("1. Basic dataset information")
print("="*50)
print(f"Rows: {len(df)}")
print(f"Columns: {len(df.columns)}")
print(f"\nColumn list: {df.columns.tolist()}")
print(f"\nDtypes:\n{df.dtypes}")

# 2. Missing values
print("\n" + "="*50)
print("2. Missing values")
print("="*50)
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'missing_count': missing,
    'missing_pct': missing_pct
})
print(missing_df[missing_df['missing_count'] > 0])

# 3. Descriptive statistics
print("\n" + "="*50)
print("3. Numeric variable statistics")
print("="*50)
print(df.describe())

# 4. Categorical frequencies
print("\n" + "="*50)
print("4. Categorical variable frequencies")
print("="*50)
for col in df.select_dtypes(include='object').columns:
    print(f"\n[{col}]")
    print(df[col].value_counts())

3.2 Univariate Analysis

import matplotlib.pyplot as plt

# --- Univariate analysis: numeric variables ---
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('id')

fig, axes = plt.subplots(2, len(numeric_cols), figsize=(15, 8))

for i, col in enumerate(numeric_cols):
    clean = df[col].dropna()

    # Top row: histogram of the raw distribution.
    hist_ax = axes[0, i]
    hist_ax.hist(clean, bins=30, edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'{col} - Histogram')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')

    # Bottom row: boxplot to surface outliers.
    box_ax = axes[1, i]
    box_ax.boxplot(clean)
    box_ax.set_title(f'{col} - Boxplot')

plt.tight_layout()
plt.show()

# --- Univariate analysis: categorical variables ---
categorical_cols = df.select_dtypes(include='object').columns.tolist()

fig, axes = plt.subplots(1, len(categorical_cols), figsize=(12, 4))

for i, col in enumerate(categorical_cols):
    ax = axes[i]
    df[col].value_counts().plot(kind='bar', ax=ax, edgecolor='black')
    ax.set_title(f'{col} - Bar Chart')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

3.3 Bivariate Analysis

import seaborn as sns

# Numeric vs numeric: scatter plots.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].scatter(df['age'], df['income'], alpha=0.5)
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Income')
axes[0].set_title('Age vs Income')

axes[1].scatter(df['age'], df['score'], alpha=0.5)
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Score')
axes[1].set_title('Age vs Score')

plt.tight_layout()
plt.show()

# Categorical vs numeric: boxplots per group.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='income', by='gender', ax=axes[0])
axes[0].set_title('Income by Gender')

df.boxplot(column='score', by='category', ax=axes[1])
axes[1].set_title('Score by Category')

plt.suptitle('')  # clear the auto-generated figure title
plt.tight_layout()
plt.show()

# Categorical vs categorical: cross-tabulation.
print("Gender x Category crosstab:")
print(pd.crosstab(df['gender'], df['category']))
print(pd.crosstab(df['gender'], df['category'], normalize='index'))  # row-wise proportions

3.4 Multivariate Analysis

# --- Multivariate analysis ---

# Heatmap of pairwise correlations among the numeric columns.
cols = ['age', 'income', 'score']
numeric_df = df[cols].dropna()
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
            center=0, vmin=-1, vmax=1, fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

# Scatter-plot matrix (pair plot), colored by gender.
pair_data = df[cols + ['gender']].dropna()
sns.pairplot(pair_data, hue='gender', diag_kind='hist')
plt.suptitle('Pair Plot', y=1.02)
plt.tight_layout()
plt.show()

4. Distribution Analysis

4.1 Checking Distributions

from scipy import stats

data = df['income'].dropna()

# Normality test - Shapiro-Wilk (recommended for n < 5000; the guard makes
# the previous redundant slice unnecessary).
if len(data) < 5000:
    stat, p_value = stats.shapiro(data)
    print(f"Shapiro-Wilk test: statistic={stat:.4f}, p-value={p_value:.4f}")

# Normality test - Kolmogorov-Smirnov.
# NOTE(review): estimating mean/std from the same sample makes the standard
# K-S p-value optimistic (a Lilliefors correction would be more accurate).
stat, p_value = stats.kstest(data, 'norm',
                             args=(data.mean(), data.std()))
print(f"K-S test: statistic={stat:.4f}, p-value={p_value:.4f}")

# Normality test - Anderson-Darling (compare statistic to critical values).
result = stats.anderson(data, dist='norm')
print(f"Anderson-Darling test: statistic={result.statistic:.4f}")

# Visual checks: histogram with fitted normal curve, and a Q-Q plot.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Histogram with the fitted normal density overlaid.
axes[0].hist(data, bins=50, density=True, alpha=0.7, edgecolor='black')
x = np.linspace(data.min(), data.max(), 100)
axes[0].plot(x, stats.norm.pdf(x, data.mean(), data.std()),
             'r-', linewidth=2, label='Normal')
axes[0].set_title('Histogram with Normal Curve')
axes[0].legend()

# Q-Q plot: points on the diagonal indicate normality.
stats.probplot(data, dist="norm", plot=axes[1])
axes[1].set_title('Q-Q Plot')

plt.tight_layout()
plt.show()

4.2 Distribution Transformations

# Compare transforms that reduce the right skew of the income distribution.
data = df['income'].dropna()

# Compute all transforms up front.
log_data = np.log1p(data)                        # log1p is safe at zero
sqrt_data = np.sqrt(data)
boxcox_data, lambda_param = stats.boxcox(data)   # requires positive data

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

panels = [
    (axes[0, 0], data, f'Original (Skew: {stats.skew(data):.2f})'),
    (axes[0, 1], log_data, f'Log Transform (Skew: {stats.skew(log_data):.2f})'),
    (axes[1, 0], sqrt_data, f'Square Root Transform (Skew: {stats.skew(sqrt_data):.2f})'),
    (axes[1, 1], boxcox_data,
     f'Box-Cox Transform (λ={lambda_param:.2f}, Skew: {stats.skew(boxcox_data):.2f})'),
]

for ax, values, title in panels:
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(title)

plt.tight_layout()
plt.show()

5. Group-wise Analysis

5.1 Group-wise Descriptive Statistics

# Group-wise summary statistics.
print("Statistics by gender:")
print(df.groupby('gender')[['age', 'income', 'score']].agg(['mean', 'std', 'count']))

print("\nStatistics by category:")
print(df.groupby('category')[['income', 'score']].describe())

# Multiple grouping keys, pivoted for readability.
print("\nMean income by gender & category:")
print(df.groupby(['gender', 'category'])['income'].mean().unstack())

5.2 Comparing Groups

# t-test (two groups): does mean income differ by gender?
male_income = df[df['gender'] == 'M']['income'].dropna()
female_income = df[df['gender'] == 'F']['income'].dropna()

stat, p_value = stats.ttest_ind(male_income, female_income)
print(f"Independent-samples t-test: t={stat:.4f}, p-value={p_value:.4f}")

# ANOVA (three or more groups): does mean score differ by category?
groups = [df[df['category'] == cat]['score'].dropna() for cat in df['category'].unique()]
stat, p_value = stats.f_oneway(*groups)
print(f"ANOVA: F={stat:.4f}, p-value={p_value:.4f}")

# Chi-squared test of independence (two categorical variables).
contingency_table = pd.crosstab(df['gender'], df['category'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-squared test: χ²={chi2:.4f}, p-value={p_value:.4f}")

6. EDA Automation Libraries

6.1 pandas-profiling (ydata-profiling)

# pip install ydata-profiling

# from ydata_profiling import ProfileReport

# report = ProfileReport(df, title="EDA Report", explorative=True)
# report.to_file("eda_report.html")

# Minimal mode for large datasets (skips the most expensive computations)
# report = ProfileReport(df, minimal=True)

6.2 sweetviz

# pip install sweetviz

# import sweetviz as sv

# report = sv.analyze(df)
# report.show_html("sweetviz_report.html")

# Compare two datasets side by side
# report = sv.compare(df1, df2)

7. EDA Checklist

## EDA Checklist

### 1. Data overview
- [ ] Check row/column counts
- [ ] Check data types
- [ ] Check memory usage

### 2. Missing values
- [ ] Whether missing values exist
- [ ] Missing-value ratio
- [ ] Missingness pattern (MCAR, MAR, MNAR)

### 3. Numeric variables
- [ ] Descriptive statistics (mean, median, standard deviation, etc.)
- [ ] Distribution shape (skewness, kurtosis)
- [ ] Whether outliers exist
- [ ] Histogram / boxplot

### 4. Categorical variables
- [ ] Number of categories
- [ ] Frequency distribution
- [ ] Whether rare (sparse) categories exist

### 5. Relationships between variables
- [ ] Correlation analysis (numeric)
- [ ] Cross-tabulation (categorical)
- [ ] Group-wise comparison

### 6. Target variable
- [ ] Target distribution (check for class imbalance)
- [ ] Relationship between target and features

Practice Problems

Problem 1: Descriptive Statistics

Compute the five-number summary of the following data.

data = [12, 15, 18, 22, 25, 28, 30, 35, 40, 100]
s = pd.Series(data)

# Solution: five-number summary (min, Q1, median, Q3, max).
print(f"Min: {s.min()}")
print(f"Q1: {s.quantile(0.25)}")
print(f"Median: {s.median()}")
print(f"Q3: {s.quantile(0.75)}")
print(f"Max: {s.max()}")

Problem 2: Correlation Analysis

Compute the correlation coefficient between the two variables and interpret it.

df = pd.DataFrame({
    'study_hours': [1, 2, 3, 4, 5, 6, 7, 8],
    'score': [50, 55, 60, 65, 70, 80, 85, 95]
})

# Solution: Pearson correlation between the two columns.
corr = df['study_hours'].corr(df['score'])
print(f"Correlation: {corr:.4f}")
# Strong positive correlation: score rises as study hours increase.

Problem 3: Group Comparison

Compute the mean and standard deviation for each group.

df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'A', 'B'],
    'value': [10, 12, 20, 22, 11, 21]
})

# Solution: aggregate per-group mean and standard deviation.
summary = df.groupby('group')['value'].agg(['mean', 'std'])
print(summary)

Summary

Measure Type | Measures | Functions
Central tendency | mean, median, mode | mean(), median(), mode()
Dispersion | variance, standard deviation, IQR | var(), std(), quantile()
Distribution shape | skewness, kurtosis | skew(), kurtosis()
Relationship | correlation, covariance | corr(), cov()
Summary | descriptive statistics | describe()
to navigate between lessons