25. Practical Projects - Comprehensive Data Analysis Practice

25. Practical Projects - Comprehensive Data Analysis Practice

Previous: Experimental Design

Overview

This is a comprehensive EDA practice using real datasets. It covers the entire analysis process from data loading to deriving insights.


프로젝트 1: Titanic 생존 분석

1.1 데이터 로딩 및 개요

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load seaborn's 'titanic' example dataset.
titanic = sns.load_dataset('titanic')

# Dataset overview: size, column names and per-column dtypes.
# NOTE: the Korean labels in this file were mis-encoded (UTF-8 bytes decoded
# with a legacy code page); they are restored here so the output is readable.
print("="*50)
print("데이터 기본 정보")
print("="*50)
print(f"행 수: {len(titanic)}")
print(f"열 수: {len(titanic.columns)}")
print(f"\n컬럼 목록:\n{titanic.columns.tolist()}")
print(f"\n데이터 타입:\n{titanic.dtypes}")

# Preview the first five rows.
print("\n" + "="*50)
print("데이터 미리보기")
print("="*50)
print(titanic.head())

1.2 결측치 분석

print("="*50)
print("결측치 분석")
print("="*50)

# Missing-value count and percentage per column, highest percentage first.
# (Column names restored from mis-encoded text: "missing count" and
# "missing ratio (%)".)
missing = titanic.isnull().sum()
missing_pct = (missing / len(titanic) * 100).round(2)
missing_df = pd.DataFrame({
    '결측치 수': missing,
    '결측치 비율(%)': missing_pct
}).sort_values('결측치 비율(%)', ascending=False)

# Keep only the columns that actually contain missing values; the same
# subset drives both the printed table and the chart below.
has_missing = missing_df[missing_df['결측치 수'] > 0]
print(has_missing)

# Horizontal bar chart of the missing percentage per column.
fig, ax = plt.subplots(figsize=(10, 6))
missing_cols = has_missing.index
missing_vals = has_missing['결측치 비율(%)']
ax.barh(missing_cols, missing_vals, color='coral')
ax.set_xlabel('결측치 비율 (%)')
ax.set_title('결측치 현황')
# Annotate each bar with its value, slightly to the right of the bar end.
for i, v in enumerate(missing_vals):
    ax.text(v + 0.5, i, f'{v}%', va='center')
plt.tight_layout()
plt.show()

1.3 타겟 변수 분석

print("="*50)
print("타겟 변수 (생존 여부) 분석")
print("="*50)

# Share of each outcome; 'survived' is looked up by label 0 (died) and
# 1 (survived), so the result does not depend on row order.
survival_rate = titanic['survived'].value_counts(normalize=True)
print(f"생존율: {survival_rate[1]:.1%}")
print(f"사망율: {survival_rate[0]:.1%}")

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# value_counts() orders by frequency, but the tick/pie labels below are
# hard-coded as [died, survived]. sort_index() pins the order to 0, 1 so the
# labels always match the data, whichever class happens to be larger.
survival_counts = titanic['survived'].value_counts().sort_index()

# Frequency (bar chart)
survival_counts.plot(kind='bar', ax=axes[0],
                     color=['coral', 'steelblue'])
axes[0].set_title('생존 여부 빈도')
axes[0].set_xticklabels(['사망', '생존'], rotation=0)
axes[0].set_ylabel('인원 수')

# Proportion (pie chart)
survival_counts.plot(kind='pie', ax=axes[1],
                     autopct='%1.1f%%',
                     colors=['coral', 'steelblue'],
                     labels=['사망', '생존'])
axes[1].set_title('생존 여부 비율')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

1.4 범주형 변수별 생존율

print("="*50)
print("범주형 변수별 생존율")
print("="*50)

categorical_vars = ['sex', 'pclass', 'embarked', 'alone']

# One panel per categorical variable on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, var in zip(axes.flat, categorical_vars):
    # Mean of 'survived' per category, highest first.
    survival_by_var = titanic.groupby(var)['survived'].mean().sort_values(ascending=False)
    survival_by_var.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
    ax.set_title(f'{var}별 생존율')
    ax.set_ylabel('생존율')
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)

    # Print the rate as a percentage just above each bar.
    for i, v in enumerate(survival_by_var):
        ax.text(i, v + 0.02, f'{v:.1%}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Numeric summary: survival rate and group size.
print("\n성별 생존율:")
print(titanic.groupby('sex')['survived'].agg(['mean', 'count']))

print("\n객실 등급별 생존율:")
print(titanic.groupby('pclass')['survived'].agg(['mean', 'count']))

1.5 수치형 변수 분석

print("="*50)
print("수치형 변수 분석")
print("="*50)

numeric_vars = ['age', 'fare']

# Row i = variable i; left column = histograms, right column = box plots.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for i, var in enumerate(numeric_vars):
    # Overlaid histograms, one per outcome (0 = died, 1 = survived).
    for survived, label, color in [(0, '사망', 'coral'), (1, '생존', 'steelblue')]:
        data = titanic[titanic['survived'] == survived][var].dropna()
        axes[i, 0].hist(data, bins=30, alpha=0.6, label=label, color=color)
    axes[i, 0].set_title(f'{var} 분포 (생존 여부별)')
    axes[i, 0].set_xlabel(var)
    axes[i, 0].legend()

    # Box plot of the variable grouped by outcome.
    titanic.boxplot(column=var, by='survived', ax=axes[i, 1])
    axes[i, 1].set_title(f'{var} (생존 여부별)')
    axes[i, 1].set_xlabel('생존 여부')

# Clear the figure-level title so only the per-axes titles remain
# (presumably the automatic one added by boxplot(by=...) — TODO confirm).
plt.suptitle('')
plt.tight_layout()
plt.show()

# Descriptive statistics of age for each outcome.
print("\n나이별 생존 통계:")
print(titanic.groupby('survived')['age'].describe())

1.6 다변량 분석

print("="*50)
print("다변량 분석")
print("="*50)

# Survival rate by passenger class (rows) and sex (columns).
pivot = pd.pivot_table(titanic, values='survived',
                       index='pclass', columns='sex', aggfunc='mean')
print("성별 & 객실 등급별 생존율:")
print(pivot)

# Heatmap of the same table; the colour scale is fixed to the 0-1 range.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, cmap='RdYlGn', fmt='.1%',
            vmin=0, vmax=1, ax=ax)
ax.set_title('성별 & 객실 등급별 생존율')
plt.show()

# Bucket age into five labelled groups (child, teen, young adult,
# middle-aged, senior). pd.cut uses right-closed intervals by default:
# (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
titanic['age_group'] = pd.cut(titanic['age'],
                              bins=[0, 12, 18, 35, 60, 100],
                              labels=['어린이', '청소년', '청년', '중년', '노년'])

# Survival rate per age group. 'age_group' is categorical, so `observed` is
# passed explicitly: recent pandas warns when it is omitted, and
# observed=False keeps every age group in the result even if it is empty.
age_survival = titanic.groupby('age_group', observed=False)['survived'].mean()
print("\n나이 그룹별 생존율:")
print(age_survival)

1.7 통계 검정

from scipy import stats

print("="*50)
print("통계 검정")
print("="*50)

# Chi-square test of independence between sex and survival.
contingency = pd.crosstab(titanic['sex'], titanic['survived'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
print(f"\n성별-생존 카이제곱 검정:")
# p-values use general ('g') formatting: with fixed '.4f' a very small
# p-value is printed as a misleading "0.0000".
print(f"χ² = {chi2:.4f}, p-value = {p_value:.4g}")

# Two-sample t-test on age between survivors and non-survivors
# (rows with missing age are dropped first).
survived_age = titanic[titanic['survived'] == 1]['age'].dropna()
died_age = titanic[titanic['survived'] == 0]['age'].dropna()

stat, p_value = stats.ttest_ind(survived_age, died_age)
print(f"\n나이-생존 t-검정:")
print(f"t = {stat:.4f}, p-value = {p_value:.4g}")
print(f"생존자 평균 나이: {survived_age.mean():.1f}")
print(f"사망자 평균 나이: {died_age.mean():.1f}")

1.8 인사이트 정리

print("="*50)
print("주요 인사이트")
print("="*50)

# Hand-written summary of the findings from the sections above. The text was
# mis-encoded in the source and is restored here; the figures are the
# author's, not recomputed by this snippet.
insights = """
1. 전체 생존율: 약 38%

2. 성별:
   - 여성 생존율(74%)이 남성(19%)보다 현저히 높음
   - "여성과 아이 먼저" 원칙의 영향

3. 객실 등급:
   - 1등석(63%) > 2등석(47%) > 3등석(24%)
   - 상위 등급일수록 생존율 높음

4. 나이:
   - 어린이 생존율이 가장 높음
   - 생존자 평균 나이가 사망자보다 약간 낮음

5. 동반자:
   - 혼자 탑승한 승객의 생존율이 낮음

6. 운임:
   - 높은 운임을 지불한 승객의 생존율이 높음
   - (객실 등급과 상관관계)
"""
print(insights)

프로젝트 2: 팁 데이터 분석

2.1 데이터 탐색

# Load seaborn's 'tips' example dataset.
tips = sns.load_dataset('tips')

print("="*50)
print("Tips 데이터셋 개요")
print("="*50)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
tips.info()
print("\n기술 통계:")
print(tips.describe())

2.2 팁 금액 분석

# Tip as a percentage of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill'] * 100

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution of the tip amount, with the mean marked by a dashed line.
axes[0, 0].hist(tips['tip'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(tips['tip'].mean(), color='red', linestyle='--',
                   label=f'평균: ${tips["tip"].mean():.2f}')
axes[0, 0].set_title('팁 금액 분포')
axes[0, 0].set_xlabel('팁 ($)')
axes[0, 0].legend()

# Distribution of the tip percentage, with the mean marked.
axes[0, 1].hist(tips['tip_pct'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 1].axvline(tips['tip_pct'].mean(), color='red', linestyle='--',
                   label=f'평균: {tips["tip_pct"].mean():.1f}%')
axes[0, 1].set_title('팁 비율 분포')
axes[0, 1].set_xlabel('팁 비율 (%)')
axes[0, 1].legend()

# Mean tip per day of week. `observed` is passed explicitly because recent
# pandas warns when grouping by a categorical column without it (assumes
# 'day' and 'time' are categorical in this dataset — TODO confirm);
# observed=False keeps every category in the result.
tips.groupby('day', observed=False)['tip'].mean().plot(
    kind='bar', ax=axes[1, 0], color='steelblue', edgecolor='black')
axes[1, 0].set_title('요일별 평균 팁')
axes[1, 0].set_ylabel('평균 팁 ($)')
axes[1, 0].tick_params(axis='x', rotation=45)

# Mean tip per meal time.
tips.groupby('time', observed=False)['tip'].mean().plot(
    kind='bar', ax=axes[1, 1], color='coral', edgecolor='black')
axes[1, 1].set_title('시간대별 평균 팁')
axes[1, 1].set_ylabel('평균 팁 ($)')
axes[1, 1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

print(f"평균 팁: ${tips['tip'].mean():.2f}")
print(f"평균 팁 비율: {tips['tip_pct'].mean():.1f}%")

2.3 청구 금액과 팁의 관계

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot: colour encodes meal time, marker size encodes party size.
sns.scatterplot(data=tips, x='total_bill', y='tip', hue='time',
                size='size', ax=axes[0])
axes[0].set_title('청구 금액 vs 팁')

# Same relationship with a fitted regression line.
sns.regplot(data=tips, x='total_bill', y='tip', ax=axes[1],
            scatter_kws={'alpha': 0.5})
axes[1].set_title('청구 금액 vs 팁 (회귀선)')

plt.tight_layout()
plt.show()

# Pearson correlation between bill and tip. `stats` is scipy.stats, imported
# in the statistical-test section earlier in this file. The p-value uses
# general ('g') formatting so a very small value is not shown as "0.0000".
corr, p_value = stats.pearsonr(tips['total_bill'], tips['tip'])
print(f"상관계수: {corr:.4f} (p-value: {p_value:.4g})")

2.4 그룹별 비교

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Tip percentage by sex. 'tip_pct' is the column created in the tip-amount
# section earlier in this file.
sns.boxplot(data=tips, x='sex', y='tip_pct', ax=axes[0, 0])
axes[0, 0].set_title('성별 팁 비율')

# Tip percentage by smoker status.
sns.boxplot(data=tips, x='smoker', y='tip_pct', ax=axes[0, 1])
axes[0, 1].set_title('흡연 여부별 팁 비율')

# Tip percentage by day of week.
sns.boxplot(data=tips, x='day', y='tip_pct', ax=axes[1, 0])
axes[1, 0].set_title('요일별 팁 비율')

# Tip percentage by party size.
sns.boxplot(data=tips, x='size', y='tip_pct', ax=axes[1, 1])
axes[1, 1].set_title('인원수별 팁 비율')

plt.tight_layout()
plt.show()

# Two-sample t-test: does the tip percentage differ between sexes?
male_tip = tips[tips['sex'] == 'Male']['tip_pct']
female_tip = tips[tips['sex'] == 'Female']['tip_pct']
stat, p_value = stats.ttest_ind(male_tip, female_tip)
print(f"\n성별 팁 비율 t-검정: t={stat:.4f}, p={p_value:.4f}")

프로젝트 3: 분석 보고서 템플릿

def generate_eda_report(df, target=None):
    """
    Print a plain-text exploratory data analysis (EDA) report to stdout.

    The report has up to six sections: basic information (rows, columns,
    memory), dtype counts, missing values, statistics for the first five
    numeric columns, a summary of the first five object/category columns,
    and, when requested, the target variable.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyze.
    target : str, optional
        Name of the target column. The target section is printed only when
        this is given and is present in ``df.columns``.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    import pandas as pd  # local import: only the dtype helpers are needed

    print("="*60)
    print("           탐색적 데이터 분석 (EDA) 보고서")
    print("="*60)

    # 1. Basic information
    print("\n1. 데이터 기본 정보")
    print("-"*40)
    print(f"   행 수: {len(df):,}")
    print(f"   열 수: {len(df.columns)}")
    print(f"   메모리 사용량: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # 2. Number of columns per dtype
    print("\n2. 데이터 타입 요약")
    print("-"*40)
    for dtype, count in df.dtypes.value_counts().items():
        print(f"   {dtype}: {count}개")

    # 3. Missing values (only columns that have at least one are listed)
    print("\n3. 결측치 현황")
    print("-"*40)
    missing = df.isnull().sum()
    missing_pct = missing / len(df) * 100
    for col, cnt, pct in zip(df.columns, missing, missing_pct):
        if cnt > 0:
            print(f"   {col}: {cnt}개 ({pct:.1f}%)")
    if missing.sum() == 0:
        print("   결측치 없음")

    # 4. Numeric columns (first five only, to keep the report short)
    print("\n4. 수치형 변수 통계")
    print("-"*40)
    for col in df.select_dtypes(include=[np.number]).columns[:5]:
        series = df[col]
        print(f"\n   [{col}]")
        print(f"   평균: {series.mean():.2f}, 중앙값: {series.median():.2f}")
        print(f"   표준편차: {series.std():.2f}")
        print(f"   범위: [{series.min():.2f}, {series.max():.2f}]")

    # 5. Categorical columns (first five only)
    print("\n5. 범주형 변수 요약")
    print("-"*40)
    for col in df.select_dtypes(include=['object', 'category']).columns[:5]:
        # mode() is empty when the column has no non-null values; indexing
        # it unconditionally would raise IndexError, so fall back to "N/A".
        modes = df[col].mode()
        most_common = modes.iloc[0] if not modes.empty else "N/A"
        print(f"\n   [{col}]")
        print(f"   고유값 수: {df[col].nunique()}")
        print(f"   최빈값: {most_common}")

    # 6. Target variable (only if given and present)
    if target is not None and target in df.columns:
        print(f"\n6. 타겟 변수 ({target}) 분석")
        print("-"*40)
        target_col = df[target]
        # Treat every integer/float dtype as numeric (int32, float32 and the
        # nullable dtypes too, not just int64/float64). Booleans are kept on
        # the class-distribution path.
        is_numeric = (pd.api.types.is_numeric_dtype(target_col)
                      and not pd.api.types.is_bool_dtype(target_col))
        if is_numeric:
            print(f"   평균: {target_col.mean():.2f}")
            print(f"   분포: 연속형")
        else:
            print(f"   클래스 분포:")
            for val, cnt in target_col.value_counts().items():
                print(f"   - {val}: {cnt} ({cnt/len(df)*100:.1f}%)")

    print("\n" + "="*60)
    print("                     보고서 끝")
    print("="*60)

# Usage example
# generate_eda_report(titanic, target='survived')

분석 체크리스트

## 데이터 분석 체크리스트

### 1단계: 데이터 이해
- [ ] 데이터 출처와 수집 방법 확인
- [ ] 각 변수의 의미 파악
- [ ] 비즈니스 문맥 이해

### 2단계: 데이터 품질 확인
- [ ] 결측치 확인 및 처리 계획
- [ ] 이상치 탐지
- [ ] 데이터 타입 확인
- [ ] 중복 데이터 확인

### 3단계: 단변량 분석
- [ ] 수치형 변수 분포 확인
- [ ] 범주형 변수 빈도 확인
- [ ] 기술통계량 계산

### 4단계: 이변량/다변량 분석
- [ ] 변수 간 상관관계 분석
- [ ] 그룹별 비교 분석
- [ ] 타겟 변수와의 관계 분석

### 5단계: 통계 검정
- [ ] 적절한 검정 방법 선택
- [ ] 가정 검증
- [ ] 결과 해석

### 6단계: 인사이트 도출
- [ ] 주요 발견 정리
- [ ] 비즈니스 의미 해석
- [ ] 추가 분석 제안

Summary

| 단계 | 주요 작업 | 도구/함수 |
|------|-----------|-----------|
| 데이터 로딩 | CSV/Excel/DB 로드 | pd.read_*() |
| 개요 파악 | 형태, 타입 확인 | info(), describe() |
| 결측치 | 확인 및 처리 | isna(), fillna() |
| 단변량 | 분포, 빈도 분석 | histplot(), countplot() |
| 이변량 | 관계 분석 | scatterplot(), boxplot() |
| 다변량 | 패턴 발견 | heatmap(), pairplot() |
| 통계 검정 | 유의성 검정 | scipy.stats |
| 인사이트 | 결과 정리 | 마크다운 보고서 |
to navigate between lessons