25. Practical Projects - Comprehensive Data Analysis Practice
25. Practical Projects - Comprehensive Data Analysis Practice
Overview
This is a comprehensive EDA practice using real datasets. It covers the entire analysis process from data loading to deriving insights.
프로젝트 1: Titanic 생존 분석
1.1 데이터 로딩 및 개요
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic dataset that ships with seaborn.
titanic = sns.load_dataset('titanic')

# Dataset overview: size, column names and dtypes.
print("=" * 50)
print("데이터 기본 정보")
print("=" * 50)
print(f"행 수: {len(titanic)}")
print(f"열 수: {len(titanic.columns)}")
print(f"\n컬럼 목록:\n{titanic.columns.tolist()}")
print(f"\n데이터 타입:\n{titanic.dtypes}")

# Preview of the first 5 rows.
print("\n" + "=" * 50)
print("데이터 미리보기")
print("=" * 50)
print(titanic.head())
1.2 결측치 분석
print("=" * 50)
print("결측치 분석")
print("=" * 50)

# Missing-value count and percentage per column, highest percentage first.
missing = titanic.isnull().sum()
missing_pct = (missing / len(titanic) * 100).round(2)
missing_df = pd.DataFrame({
    '결측치 수': missing,
    '결측치 비율(%)': missing_pct,
}).sort_values('결측치 비율(%)', ascending=False)
print(missing_df[missing_df['결측치 수'] > 0])

# Horizontal bar chart restricted to the columns that have missing values.
fig, ax = plt.subplots(figsize=(10, 6))
missing_cols = missing_df[missing_df['결측치 수'] > 0].index
missing_vals = missing_df.loc[missing_cols, '결측치 비율(%)']
ax.barh(missing_cols, missing_vals, color='coral')
ax.set_xlabel('결측치 비율 (%)')
ax.set_title('결측치 현황')
# Write the percentage just to the right of each bar.
for i, v in enumerate(missing_vals):
    ax.text(v + 0.5, i, f'{v}%', va='center')
plt.tight_layout()
plt.show()
1.3 타겟 변수 분석
print("=" * 50)
print("타겟 변수 (생존 여부) 분석")
print("=" * 50)

# Survival / death rate (index 1 = survived, 0 = died).
survival_rate = titanic['survived'].value_counts(normalize=True)
print(f"생존율: {survival_rate[1]:.1%}")
print(f"사망율: {survival_rate[0]:.1%}")

# value_counts() orders by frequency, so sort by class value (0, 1) to make
# sure the hard-coded labels and colours below always match the right bar.
survived_counts = titanic['survived'].value_counts().sort_index()
status_labels = ['사망', '생존']
status_colors = ['coral', 'steelblue']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Frequency (bar chart)
survived_counts.plot(kind='bar', ax=axes[0], color=status_colors)
axes[0].set_title('생존 여부 빈도')
axes[0].set_xticklabels(status_labels, rotation=0)
axes[0].set_ylabel('인원 수')

# Proportion (pie chart)
survived_counts.plot(kind='pie', ax=axes[1],
                     autopct='%1.1f%%',
                     colors=status_colors,
                     labels=status_labels)
axes[1].set_title('생존 여부 비율')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()
1.4 범주형 변수별 생존율
print("=" * 50)
print("범주형 변수별 생존율")
print("=" * 50)

categorical_vars = ['sex', 'pclass', 'embarked', 'alone']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# One subplot per categorical variable, bars sorted by survival rate.
for ax, var in zip(axes.flat, categorical_vars):
    survival_by_var = titanic.groupby(var)['survived'].mean().sort_values(ascending=False)
    survival_by_var.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
    ax.set_title(f'{var}별 생존율')
    ax.set_ylabel('생존율')
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)
    # Annotate each bar with its value.
    for i, v in enumerate(survival_by_var):
        ax.text(i, v + 0.02, f'{v:.1%}', ha='center', va='bottom')
plt.tight_layout()
plt.show()

# Summary statistics
print("\n성별 생존율:")
print(titanic.groupby('sex')['survived'].agg(['mean', 'count']))
print("\n객실 등급별 생존율:")
print(titanic.groupby('pclass')['survived'].agg(['mean', 'count']))
1.5 수치형 변수 분석
print("=" * 50)
print("수치형 변수 분석")
print("=" * 50)

numeric_vars = ['age', 'fare']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# One row per numeric variable: histogram on the left, box plot on the right.
for i, var in enumerate(numeric_vars):
    # Overlaid histograms, one per survival outcome.
    for survived, label, color in [(0, '사망', 'coral'), (1, '생존', 'steelblue')]:
        data = titanic[titanic['survived'] == survived][var].dropna()
        axes[i, 0].hist(data, bins=30, alpha=0.6, label=label, color=color)
    axes[i, 0].set_title(f'{var} 분포 (생존 여부별)')
    axes[i, 0].set_xlabel(var)
    axes[i, 0].legend()
    # Box plot grouped by survival outcome.
    titanic.boxplot(column=var, by='survived', ax=axes[i, 1])
    axes[i, 1].set_title(f'{var} (생존 여부별)')
    axes[i, 1].set_xlabel('생존 여부')
# Clear the figure-level title so only the per-axes titles remain.
plt.suptitle('')
plt.tight_layout()
plt.show()

# Summary statistics
print("\n나이별 생존 통계:")
print(titanic.groupby('survived')['age'].describe())
1.6 다변량 분석
print("=" * 50)
print("다변량 분석")
print("=" * 50)

# Survival rate by passenger class (rows) and sex (columns).
pivot = pd.pivot_table(titanic, values='survived',
                       index='pclass', columns='sex', aggfunc='mean')
print("성별 & 객실 등급별 생존율:")
print(pivot)

# Heatmap of the pivot table.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, cmap='RdYlGn', fmt='.1%',
            vmin=0, vmax=1, ax=ax)
ax.set_title('성별 & 객실 등급별 생존율')
plt.show()

# Bucket ages into groups; intervals are (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
titanic['age_group'] = pd.cut(titanic['age'],
                              bins=[0, 12, 18, 35, 60, 100],
                              labels=['어린이', '청소년', '청년', '중년', '노년'])

# Survival rate per age group. `age_group` is categorical, so pass `observed`
# explicitly: this keeps every category in the result and avoids the pandas
# FutureWarning about the changing default.
age_survival = titanic.groupby('age_group', observed=False)['survived'].mean()
print("\n나이 그룹별 생존율:")
print(age_survival)
1.7 통계 검정
from scipy import stats

print("=" * 50)
print("통계 검정")
print("=" * 50)

# Does survival differ by sex? (chi-square test of independence)
contingency = pd.crosstab(titanic['sex'], titanic['survived'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
print(f"\n성별-생존 카이제곱 검정:")
print(f"χ² = {chi2:.4f}, p-value = {p_value:.4f}")

# Does age differ by survival outcome? (independent two-sample t-test;
# rows with a missing age are dropped first)
survived_age = titanic[titanic['survived'] == 1]['age'].dropna()
died_age = titanic[titanic['survived'] == 0]['age'].dropna()
stat, p_value = stats.ttest_ind(survived_age, died_age)
print(f"\n나이-생존 t-검정:")
print(f"t = {stat:.4f}, p-value = {p_value:.4f}")
print(f"생존자 평균 나이: {survived_age.mean():.1f}")
print(f"사망자 평균 나이: {died_age.mean():.1f}")
1.8 인사이트 정리
print("=" * 50)
print("주요 인사이트")
print("=" * 50)

# Hand-written summary of the findings from the analysis above.
insights = """
1. 전체 생존율: 약 38%
2. 성별:
   - 여성 생존율(74%)이 남성(19%)보다 현저히 높음
   - "여성과 아이 먼저" 원칙의 영향
3. 객실 등급:
   - 1등석(63%) > 2등석(47%) > 3등석(24%)
   - 상위 등급일수록 생존율 높음
4. 나이:
   - 어린이 생존율이 가장 높음
   - 생존자 평균 나이가 사망자보다 약간 낮음
5. 동반자:
   - 혼자 탑승한 승객의 생존율이 낮음
6. 운임:
   - 높은 운임을 지불한 승객의 생존율이 높음
   - (객실 등급과 상관관계)
"""
print(insights)
프로젝트 2: 팁 데이터 분석
2.1 데이터 탐색
# Load the tips dataset that ships with seaborn.
tips = sns.load_dataset('tips')

print("=" * 50)
print("Tips 데이터셋 개요")
print("=" * 50)
# DataFrame.info() prints its summary itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
tips.info()
print("\n기술 통계:")
print(tips.describe())
2.2 팁 금액 분석
# Tip as a percentage of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill'] * 100

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution of tip amounts, with the mean marked.
axes[0, 0].hist(tips['tip'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 0].axvline(tips['tip'].mean(), color='red', linestyle='--',
                   label=f'평균: ${tips["tip"].mean():.2f}')
axes[0, 0].set_title('팁 금액 분포')
axes[0, 0].set_xlabel('팁 ($)')
axes[0, 0].legend()

# Distribution of tip percentages, with the mean marked.
axes[0, 1].hist(tips['tip_pct'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 1].axvline(tips['tip_pct'].mean(), color='red', linestyle='--',
                   label=f'평균: {tips["tip_pct"].mean():.1f}%')
axes[0, 1].set_title('팁 비율 분포')
axes[0, 1].set_xlabel('팁 비율 (%)')
axes[0, 1].legend()

# Mean tip by day of week. `day` and `time` are categorical columns, so
# `observed` is passed explicitly to avoid the pandas FutureWarning about
# the changing default while keeping the current result.
tips.groupby('day', observed=False)['tip'].mean().plot(
    kind='bar', ax=axes[1, 0], color='steelblue', edgecolor='black')
axes[1, 0].set_title('요일별 평균 팁')
axes[1, 0].set_ylabel('평균 팁 ($)')
axes[1, 0].tick_params(axis='x', rotation=45)

# Mean tip by time of day.
tips.groupby('time', observed=False)['tip'].mean().plot(
    kind='bar', ax=axes[1, 1], color='coral', edgecolor='black')
axes[1, 1].set_title('시간대별 평균 팁')
axes[1, 1].set_ylabel('평균 팁 ($)')
axes[1, 1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

print(f"평균 팁: ${tips['tip'].mean():.2f}")
print(f"평균 팁 비율: {tips['tip_pct'].mean():.1f}%")
2.3 청구 금액과 팁의 관계
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot: colour encodes meal time, marker size encodes party size.
sns.scatterplot(data=tips, x='total_bill', y='tip', hue='time',
                size='size', ax=axes[0])
axes[0].set_title('청구 금액 vs 팁')

# Same relationship with a fitted regression line.
sns.regplot(data=tips, x='total_bill', y='tip', ax=axes[1],
            scatter_kws={'alpha': 0.5})
axes[1].set_title('청구 금액 vs 팁 (회귀선)')
plt.tight_layout()
plt.show()

# Pearson correlation coefficient (`stats` is scipy.stats, imported earlier
# in this file).
corr, p_value = stats.pearsonr(tips['total_bill'], tips['tip'])
print(f"상관계수: {corr:.4f} (p-value: {p_value:.4f})")
2.4 그룹별 비교
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Tip percentage by sex
sns.boxplot(data=tips, x='sex', y='tip_pct', ax=axes[0, 0])
axes[0, 0].set_title('성별 팁 비율')

# Tip percentage by smoker status
sns.boxplot(data=tips, x='smoker', y='tip_pct', ax=axes[0, 1])
axes[0, 1].set_title('흡연 여부별 팁 비율')

# Tip percentage by day of week
sns.boxplot(data=tips, x='day', y='tip_pct', ax=axes[1, 0])
axes[1, 0].set_title('요일별 팁 비율')

# Tip percentage by party size
sns.boxplot(data=tips, x='size', y='tip_pct', ax=axes[1, 1])
axes[1, 1].set_title('인원수별 팁 비율')

plt.tight_layout()
plt.show()

# Statistical test: difference in tip percentage between sexes
# (independent two-sample t-test; `stats` is scipy.stats, imported earlier
# in this file).
male_tip = tips[tips['sex'] == 'Male']['tip_pct']
female_tip = tips[tips['sex'] == 'Female']['tip_pct']
stat, p_value = stats.ttest_ind(male_tip, female_tip)
print(f"\n성별 팁 비율 t-검정: t={stat:.4f}, p={p_value:.4f}")
프로젝트 3: 분석 보고서 템플릿
def generate_eda_report(df, target=None):
    """
    Print an exploratory data analysis (EDA) report for a DataFrame.

    The report is written to stdout in up to six sections: basic
    information, dtype summary, missing values, statistics for the first
    five numeric columns, a summary of the first five object/category
    columns and, when requested, an analysis of the target column.

    Parameters:
    -----------
    df : DataFrame
        DataFrame to analyse.
    target : str, optional
        Name of the target column. The target section is skipped when this
        is falsy or is not a column of ``df``.

    Returns:
    --------
    None
    """
    print("=" * 60)
    print(" 탐색적 데이터 분석 (EDA) 보고서")
    print("=" * 60)

    # 1. Basic information
    print("\n1. 데이터 기본 정보")
    print("-" * 40)
    print(f" 행 수: {len(df):,}")
    print(f" 열 수: {len(df.columns)}")
    print(f" 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # 2. Dtype summary: number of columns per dtype
    print("\n2. 데이터 타입 요약")
    print("-" * 40)
    for dtype, count in df.dtypes.value_counts().items():
        print(f" {dtype}: {count}개")

    # 3. Missing values: only columns that actually have some are listed
    print("\n3. 결측치 현황")
    print("-" * 40)
    missing = df.isnull().sum()
    missing_pct = missing / len(df) * 100
    for (col, cnt), pct in zip(missing.items(), missing_pct):
        if cnt > 0:
            print(f" {col}: {cnt}개 ({pct:.1f}%)")
    if missing.sum() == 0:
        print(" 결측치 없음")

    # 4. Numeric columns (first five only, to keep the report short)
    print("\n4. 수치형 변수 통계")
    print("-" * 40)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols[:5]:
        series = df[col]
        print(f"\n [{col}]")
        print(f" 평균: {series.mean():.2f}, 중앙값: {series.median():.2f}")
        print(f" 표준편차: {series.std():.2f}")
        print(f" 범위: [{series.min():.2f}, {series.max():.2f}]")

    # 5. Categorical columns (first five only)
    print("\n5. 범주형 변수 요약")
    print("-" * 40)
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols[:5]:
        modes = df[col].mode()
        # mode() is empty when the column holds only missing values;
        # indexing it unconditionally would raise IndexError.
        most_frequent = modes.iloc[0] if not modes.empty else "N/A"
        print(f"\n [{col}]")
        print(f" 고유값 수: {df[col].nunique()}")
        print(f" 최빈값: {most_frequent}")

    # 6. Target column (only when given and present)
    if target and target in df.columns:
        print(f"\n6. 타겟 변수 ({target}) 분석")
        print("-" * 40)
        target_col = df[target]
        # Treat any numeric dtype (not just int64/float64) as continuous;
        # booleans are reported as classes.
        is_continuous = (
            pd.api.types.is_numeric_dtype(target_col)
            and not pd.api.types.is_bool_dtype(target_col)
        )
        if is_continuous:
            print(f" 평균: {target_col.mean():.2f}")
            print(" 분포: 연속형")
        else:
            print(" 클래스 분포:")
            for val, cnt in target_col.value_counts().items():
                print(f" - {val}: {cnt} ({cnt / len(df) * 100:.1f}%)")

    print("\n" + "=" * 60)
    print(" 보고서 끝")
    print("=" * 60)


# Usage example
# generate_eda_report(titanic, target='survived')
분석 체크리스트
## 데이터 분석 체크리스트
### 1단계: 데이터 이해
- [ ] 데이터 출처와 수집 방법 확인
- [ ] 각 변수의 의미 파악
- [ ] 비즈니스 문맥 이해
### 2단계: 데이터 품질 확인
- [ ] 결측치 확인 및 처리 계획
- [ ] 이상치 탐지
- [ ] 데이터 타입 확인
- [ ] 중복 데이터 확인
### 3단계: 단변량 분석
- [ ] 수치형 변수 분포 확인
- [ ] 범주형 변수 빈도 확인
- [ ] 기술통계량 계산
### 4단계: 이변량/다변량 분석
- [ ] 변수 간 상관관계 분석
- [ ] 그룹별 비교 분석
- [ ] 타겟 변수와의 관계 분석
### 5단계: 통계 검정
- [ ] 적절한 검정 방법 선택
- [ ] 가정 검증
- [ ] 결과 해석
### 6단계: 인사이트 도출
- [ ] 주요 발견 정리
- [ ] 비즈니스 의미 해석
- [ ] 추가 분석 제안
Summary
| 단계 | 주요 작업 | 도구/함수 |
|---|---|---|
| 데이터 로딩 | CSV/Excel/DB 로드 | pd.read_*() |
| 개요 파악 | 형태, 타입 확인 | info(), describe() |
| 결측치 | 확인 및 처리 | isna(), fillna() |
| 단변량 | 분포, 빈도 분석 | histplot(), countplot() |
| 이변량 | 관계 분석 | scatterplot(), boxplot() |
| 다변량 | 패턴 발견 | heatmap(), pairplot() |
| 통계 검정 | 유의성 검정 | scipy.stats |
| 인사이트 | 결과 정리 | 마크다운 보고서 |