15. ๋ถ์ฐ๋ถ์ (ANOVA - Analysis of Variance)
15. ๋ถ์ฐ๋ถ์ (ANOVA - Analysis of Variance)¶
์ด์ : ๊ฐ์ค๊ฒ์ ์ฌํ | ๋ค์: ํ๊ท๋ถ์ ์ฌํ
๊ฐ์¶
๋ถ์ฐ๋ถ์(ANOVA)์ ์ธ ๊ฐ ์ด์์ ๊ทธ๋ฃน ํ๊ท ์ ๋์์ ๋น๊ตํ๋ ํต๊ณ ๊ธฐ๋ฒ์ ๋๋ค. ์ฌ๋ฌ t-๊ฒ์ ์ ๋ฐ๋ณตํ๋ ๊ฒ๋ณด๋ค ์ 1์ข ์ค๋ฅ๋ฅผ ํต์ ํ๋ฉด์ ํจ์จ์ ์ผ๋ก ๋ถ์ํ ์ ์์ต๋๋ค.
1. ์ผ์๋ฐฐ์น ๋ถ์ฐ๋ถ์ (One-way ANOVA)¶
1.1 ๊ธฐ๋ณธ ๊ฐ๋ ¶
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
import statsmodels.api as sm
from statsmodels.formula.api import ols
# ANOVA์ ๊ธฐ๋ณธ ์์ด๋์ด:
# ์ด ๋ณ๋ = ๊ทธ๋ฃน ๊ฐ ๋ณ๋ + ๊ทธ๋ฃน ๋ด ๋ณ๋
# Hโ: ฮผโ = ฮผโ = ... = ฮผโ (๋ชจ๋ ๊ทธ๋ฃน ํ๊ท ์ด ๊ฐ๋ค)
# Hโ: ์ ์ด๋ ํ๋์ ๊ทธ๋ฃน ํ๊ท ์ด ๋ค๋ฅด๋ค
# ์์ ๋ฐ์ดํฐ ์์ฑ
np.random.seed(42)
# ์ธ ๊ฐ์ง ๊ต์๋ฒ์ ํจ๊ณผ ๋น๊ต
method_A = np.random.normal(75, 10, 25) # ์ ํต์ ๊ต์๋ฒ
method_B = np.random.normal(80, 10, 25) # ํ ๋ก ์ ๊ต์๋ฒ
method_C = np.random.normal(85, 10, 25) # ํ๋ฆฝ๋ฌ๋
# DataFrame ์์ฑ
df = pd.DataFrame({
'score': np.concatenate([method_A, method_B, method_C]),
'method': ['A']*25 + ['B']*25 + ['C']*25
})
# ์๊ฐํ
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Box plot
sns.boxplot(data=df, x='method', y='score', ax=axes[0], palette='Set2')
axes[0].set_xlabel('๊ต์๋ฒ')
axes[0].set_ylabel('์ ์')
axes[0].set_title('๊ต์๋ฒ๋ณ ์ ์ ๋ถํฌ')
# ๊ฐ ๊ทธ๋ฃน์ ํ๊ท ๊ณผ ๊ฐ๋ณ ์ ์
for i, method in enumerate(['A', 'B', 'C']):
data = df[df['method'] == method]['score']
axes[1].scatter([i]*len(data), data, alpha=0.5, s=30)
axes[1].scatter([i], [data.mean()], color='red', s=100, marker='D', zorder=5)
axes[1].set_xticks([0, 1, 2])
axes[1].set_xticklabels(['A', 'B', 'C'])
axes[1].set_xlabel('๊ต์๋ฒ')
axes[1].set_ylabel('์ ์')
axes[1].set_title('๊ฐ๋ณ ์ ์์ ๊ทธ๋ฃน ํ๊ท ')
axes[1].axhline(df['score'].mean(), color='blue', linestyle='--', label='์ ์ฒด ํ๊ท ')
axes[1].legend()
plt.tight_layout()
plt.show()
print(f"์ ์ฒด ํ๊ท : {df['score'].mean():.2f}")
print(f"๊ทธ๋ฃน๋ณ ํ๊ท :")
print(df.groupby('method')['score'].mean())
1.2 ANOVA ๊ณ์ฐ¶
def one_way_anova_manual(groups):
"""์ผ์๋ฐฐ์น ANOVA ์๋ ๊ณ์ฐ"""
# ์ ์ฒด ๋ฐ์ดํฐ
all_data = np.concatenate(groups)
n_total = len(all_data)
k = len(groups) # ๊ทธ๋ฃน ์
grand_mean = all_data.mean()
# ๊ทธ๋ฃน ๊ฐ ๋ณ๋ (SSB: Sum of Squares Between)
SSB = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
# ๊ทธ๋ฃน ๋ด ๋ณ๋ (SSW: Sum of Squares Within)
SSW = sum(sum((x - g.mean())**2 for x in g) for g in groups)
# ์ด ๋ณ๋ (SST: Sum of Squares Total)
SST = sum((x - grand_mean)**2 for x in all_data)
# ์์ ๋
df_between = k - 1
df_within = n_total - k
df_total = n_total - 1
# ํ๊ท ์ ๊ณฑ (Mean Squares)
MSB = SSB / df_between
MSW = SSW / df_within
# F-ํต๊ณ๋
F_stat = MSB / MSW
# p-value
p_value = 1 - stats.f.cdf(F_stat, df_between, df_within)
return {
'SSB': SSB, 'SSW': SSW, 'SST': SST,
'df_between': df_between, 'df_within': df_within,
'MSB': MSB, 'MSW': MSW,
'F_stat': F_stat, 'p_value': p_value
}
# ๊ณ์ฐ
groups = [method_A, method_B, method_C]
result = one_way_anova_manual(groups)
print("์ผ์๋ฐฐ์น ANOVA ๊ฒฐ๊ณผ (์๋ ๊ณ์ฐ):")
print("=" * 60)
print("๋ถ์ฐ๋ถ์ํ:")
print("-" * 60)
print(f"{'๋ณ๋์':<12} {'SS':<12} {'df':<8} {'MS':<12} {'F':<10} {'p-value':<10}")
print("-" * 60)
print(f"{'์ฒ๋ฆฌ(๊ทธ๋ฃน๊ฐ)':<12} {result['SSB']:<12.2f} {result['df_between']:<8} "
f"{result['MSB']:<12.2f} {result['F_stat']:<10.3f} {result['p_value']:<10.4f}")
print(f"{'์ค์ฐจ(๊ทธ๋ฃน๋ด)':<12} {result['SSW']:<12.2f} {result['df_within']:<8} {result['MSW']:<12.2f}")
print(f"{'์ด๊ณ':<12} {result['SST']:<12.2f} {result['df_between'] + result['df_within']:<8}")
print("=" * 60)
1.3 scipy์ statsmodels ํ์ฉ¶
# scipy.stats.f_oneway
F_scipy, p_scipy = stats.f_oneway(method_A, method_B, method_C)
print(f"scipy.stats.f_oneway:")
print(f" F = {F_scipy:.3f}, p = {p_scipy:.4f}")
# statsmodels OLS๋ฅผ ์ด์ฉํ ANOVA
model = ols('score ~ C(method)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(f"\nstatsmodels ANOVA Table:")
print(anova_table)
# pingouin
anova_pg = pg.anova(data=df, dv='score', between='method', detailed=True)
print(f"\npingouin ANOVA:")
print(anova_pg)
1.4 F-๋ถํฌ ์ดํด¶
# F-๋ถํฌ: ๋ ์นด์ด์ ๊ณฑ ๋ณ์์ ๋น์จ
# F = (ฯยฒโ/dfโ) / (ฯยฒโ/dfโ)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# F-๋ถํฌ (๋ค์ํ ์์ ๋)
x = np.linspace(0, 5, 200)
df_pairs = [(2, 20), (5, 20), (10, 20), (5, 50)]
for df1, df2 in df_pairs:
axes[0].plot(x, stats.f.pdf(x, df1, df2), label=f'F({df1},{df2})')
axes[0].set_xlabel('F')
axes[0].set_ylabel('๋ฐ๋')
axes[0].set_title('F-๋ถํฌ (๋ค์ํ ์์ ๋)')
axes[0].legend()
axes[0].grid(alpha=0.3)
# ํ์ฌ ๋ถ์์ F-๋ถํฌ์ ๊ด์ฐฐ๋ F-๊ฐ
df1, df2 = result['df_between'], result['df_within']
x = np.linspace(0, 8, 200)
y = stats.f.pdf(x, df1, df2)
axes[1].plot(x, y, 'b-', lw=2, label=f'F({df1},{df2})')
axes[1].fill_between(x, y, where=(x >= result['F_stat']), alpha=0.3, color='red',
label=f'p-value = {result["p_value"]:.4f}')
axes[1].axvline(result['F_stat'], color='red', linestyle='--',
label=f'F = {result["F_stat"]:.2f}')
# ์๊ณ๊ฐ
f_critical = stats.f.ppf(0.95, df1, df2)
axes[1].axvline(f_critical, color='green', linestyle=':',
label=f'F_crit (ฮฑ=0.05) = {f_critical:.2f}')
axes[1].set_xlabel('F')
axes[1].set_ylabel('๋ฐ๋')
axes[1].set_title('F-๋ถํฌ์ ๊ฒ์ ๊ฒฐ๊ณผ')
axes[1].legend()
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
2. ANOVA ๊ฐ์ ํ์ธ¶
2.1 ์ ๊ท์ฑ ๊ฒ์ ¶
# ๊ฐ ๊ทธ๋ฃน์ ์ ๊ท์ฑ ๊ฒ์
print("์ ๊ท์ฑ ๊ฒ์ (Shapiro-Wilk):")
print("-" * 40)
for method in ['A', 'B', 'C']:
data = df[df['method'] == method]['score']
stat, p = stats.shapiro(data)
print(f"๊ทธ๋ฃน {method}: W = {stat:.4f}, p = {p:.4f}")
# ์์ฐจ์ ์ ๊ท์ฑ (๋ ์ค์)
residuals = model.resid
stat, p = stats.shapiro(residuals)
print(f"\n์์ฐจ ์ ๊ท์ฑ: W = {stat:.4f}, p = {p:.4f}")
# Q-Q plot
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# ๊ทธ๋ฃน๋ณ Q-Q plot
for i, method in enumerate(['A', 'B', 'C']):
data = df[df['method'] == method]['score']
stats.probplot(data, dist="norm", plot=axes[0])
axes[0].set_title('๊ทธ๋ฃน๋ณ Q-Q Plot')
# ์์ฐจ Q-Q plot
stats.probplot(residuals, dist="norm", plot=axes[1])
axes[1].set_title('์์ฐจ Q-Q Plot')
plt.tight_layout()
plt.show()
2.2 ๋ฑ๋ถ์ฐ์ฑ ๊ฒ์ ¶
# Levene's test (์ค์๊ฐ ๊ธฐ๋ฐ, ๋ ๋ก๋ฒ์คํธ)
stat_levene, p_levene = stats.levene(method_A, method_B, method_C)
print(f"Levene's Test: F = {stat_levene:.4f}, p = {p_levene:.4f}")
# Bartlett's test (์ ๊ท์ฑ ๊ฐ์ )
stat_bartlett, p_bartlett = stats.bartlett(method_A, method_B, method_C)
print(f"Bartlett's Test: ฯยฒ = {stat_bartlett:.4f}, p = {p_bartlett:.4f}")
# pingouin
homoscedasticity_result = pg.homoscedasticity(df, dv='score', group='method')
print(f"\npingouin ๋ฑ๋ถ์ฐ์ฑ ๊ฒ์ :")
print(homoscedasticity_result)
# ์์ฐจ vs ์ ํฉ๊ฐ ๊ทธ๋ฆผ
fig, ax = plt.subplots(figsize=(10, 5))
fitted = model.fittedvalues
residuals = model.resid
ax.scatter(fitted, residuals, alpha=0.6)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('์ ํฉ๊ฐ')
ax.set_ylabel('์์ฐจ')
ax.set_title('์์ฐจ vs ์ ํฉ๊ฐ (๋ฑ๋ถ์ฐ์ฑ ํ์ธ)')
# ๊ฐ ๊ทธ๋ฃน์ ๋ถ์ฐ ํ์
for method in ['A', 'B', 'C']:
data = df[df['method'] == method]
mean_val = data['score'].mean()
ax.axvline(mean_val, color='gray', linestyle=':', alpha=0.5)
plt.show()
print("\n๊ทธ๋ฃน๋ณ ํ์คํธ์ฐจ:")
print(df.groupby('method')['score'].std())
2.3 ๋ฑ๋ถ์ฐ ๊ฐ์ ์๋ฐ ์: Welch's ANOVA¶
# ๋ฑ๋ถ์ฐ ๊ฐ์ ์ด ์๋ฐ๋ ๊ฒฝ์ฐ
np.random.seed(42)
# ๋ถ์ฐ์ด ๋ค๋ฅธ ๊ทธ๋ฃน๋ค
group1 = np.random.normal(50, 5, 30) # ฯ = 5
group2 = np.random.normal(55, 15, 30) # ฯ = 15
group3 = np.random.normal(60, 25, 30) # ฯ = 25
df_hetero = pd.DataFrame({
'value': np.concatenate([group1, group2, group3]),
'group': ['G1']*30 + ['G2']*30 + ['G3']*30
})
# ๋ฑ๋ถ์ฐ์ฑ ๊ฒ์
stat, p = stats.levene(group1, group2, group3)
print(f"Levene's Test: p = {p:.4f} (๋ฑ๋ถ์ฐ ๊ฐ์ ์๋ฐ)")
# ์ผ๋ฐ ANOVA
F_normal, p_normal = stats.f_oneway(group1, group2, group3)
print(f"\n์ผ๋ฐ ANOVA: F = {F_normal:.3f}, p = {p_normal:.4f}")
# Welch's ANOVA (pingouin)
welch_result = pg.welch_anova(data=df_hetero, dv='value', between='group')
print(f"\nWelch's ANOVA:")
print(welch_result)
# Games-Howell ์ฌํ๊ฒ์ (๋ฑ๋ถ์ฐ ๊ฐ์ X)
games_howell = pg.pairwise_gameshowell(data=df_hetero, dv='value', between='group')
print(f"\nGames-Howell ์ฌํ๊ฒ์ :")
print(games_howell)
3. ์ฌํ๊ฒ์ (Post-hoc Tests)¶
3.1 Tukey HSD¶
# ANOVA๊ฐ ์ ์ํ๋ฉด โ ์ด๋ค ๊ทธ๋ฃน์ด ๋ค๋ฅธ์ง ํ์ธ
# Tukey's Honest Significant Difference
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# ์๋ณธ ๋ฐ์ดํฐ๋ก
tukey_result = pairwise_tukeyhsd(df['score'], df['method'], alpha=0.05)
print("Tukey HSD ์ฌํ๊ฒ์ :")
print(tukey_result)
# ์๊ฐํ
fig = tukey_result.plot_simultaneous(figsize=(10, 4))
plt.title('Tukey HSD: 95% ์ ๋ขฐ๊ตฌ๊ฐ')
plt.xlabel('์ ์')
plt.tight_layout()
plt.show()
# pingouin ์ฌ์ฉ
tukey_pg = pg.pairwise_tukey(data=df, dv='score', between='method')
print("\npingouin Tukey HSD:")
print(tukey_pg)
3.2 ๋ค์ํ ์ฌํ๊ฒ์ ๋ฐฉ๋ฒ¶
# ๋ค์ํ ์ฌํ๊ฒ์ ๋น๊ต
print("๋ค์ํ ์ฌํ๊ฒ์ ๋ฐฉ๋ฒ ๋น๊ต:")
print("=" * 70)
# 1. Bonferroni
bonf = pg.pairwise_tests(data=df, dv='score', between='method',
padjust='bonf', effsize='cohen')
print("\n1. Bonferroni:")
print(bonf[['A', 'B', 'T', 'p-unc', 'p-corr', 'cohen']])
# 2. Holm
holm = pg.pairwise_tests(data=df, dv='score', between='method',
padjust='holm', effsize='cohen')
print("\n2. Holm:")
print(holm[['A', 'B', 'p-unc', 'p-corr']])
# 3. Scheffe (๋ ๋ณด์์ )
# statsmodels์์ ์ง์ ๊ณ์ฐ ํ์
# 4. Dunnett (๋์กฐ๊ตฐ ๋น๊ต)
# A๋ฅผ ๋์กฐ๊ตฐ์ผ๋ก ์ค์
from scipy.stats import dunnett
dunnett_result = dunnett(method_B, method_C, control=method_A)
print("\n4. Dunnett (A๊ฐ ๋์กฐ๊ตฐ):")
print(f" B vs A: statistic={dunnett_result.statistic[0]:.3f}, p={dunnett_result.pvalue[0]:.4f}")
print(f" C vs A: statistic={dunnett_result.statistic[1]:.3f}, p={dunnett_result.pvalue[1]:.4f}")
3.3 ํจ๊ณผํฌ๊ธฐ¶
# ANOVA ํจ๊ณผํฌ๊ธฐ
# Eta-squared (ฮทยฒ)
ss_between = result['SSB']
ss_total = result['SST']
eta_squared = ss_between / ss_total
# Omega-squared (ฯยฒ) - ํธํฅ ๋ณด์
ms_within = result['MSW']
n_total = len(df)
k = 3
omega_squared = (ss_between - (k-1)*ms_within) / (ss_total + ms_within)
# Partial eta-squared (์ด ๊ฒฝ์ฐ eta-squared์ ๋์ผ)
partial_eta_squared = ss_between / (ss_between + result['SSW'])
print("ANOVA ํจ๊ณผํฌ๊ธฐ:")
print("-" * 40)
print(f"ฮทยฒ (Eta-squared): {eta_squared:.4f}")
print(f"ฯยฒ (Omega-squared): {omega_squared:.4f}")
print(f"Partial ฮทยฒ: {partial_eta_squared:.4f}")
print("\nํด์ ๊ธฐ์ค (Cohen):")
print(" ฮทยฒ โ 0.01: ์์ ํจ๊ณผ")
print(" ฮทยฒ โ 0.06: ์ค๊ฐ ํจ๊ณผ")
print(" ฮทยฒ โ 0.14: ํฐ ํจ๊ณผ")
# pingouin ๊ฒฐ๊ณผ์์ ํจ๊ณผํฌ๊ธฐ
print(f"\npingouin ๊ฒฐ๊ณผ์ ฮทยฒ: {anova_pg['np2'].values[0]:.4f}")
4. ์ด์๋ฐฐ์น ๋ถ์ฐ๋ถ์ (Two-way ANOVA)¶
4.1 ๊ธฐ๋ณธ ๊ฐ๋ ¶
# ๋ ๊ฐ์ง ๋
๋ฆฝ๋ณ์(์์ธ)์ ํจ๊ณผ๋ฅผ ๋์์ ๋ถ์
# ์ฃผํจ๊ณผ(Main Effect) + ์ํธ์์ฉ(Interaction)
np.random.seed(42)
# ์์: ๊ต์๋ฒ(A, B) ร ํ์ต์๊ฐ(Low, High)์ ํจ๊ณผ
n_per_cell = 20
data = {
'score': [],
'method': [],
'time': []
}
# 2x2 ์ค๊ณ
# ์ํธ์์ฉ ํจ๊ณผ ํฌํจ
effects = {
('A', 'Low'): 70,
('A', 'High'): 80,
('B', 'Low'): 75,
('B', 'High'): 90, # B + High์ ์๋์ง ํจ๊ณผ
}
for (method, time), mean in effects.items():
scores = np.random.normal(mean, 8, n_per_cell)
data['score'].extend(scores)
data['method'].extend([method] * n_per_cell)
data['time'].extend([time] * n_per_cell)
df_two = pd.DataFrame(data)
# ์
ํ๊ท ํ์ธ
cell_means = df_two.groupby(['method', 'time'])['score'].mean().unstack()
print("์
ํ๊ท :")
print(cell_means)
# ์๊ฐํ
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Box plot
sns.boxplot(data=df_two, x='method', y='score', hue='time', ax=axes[0])
axes[0].set_title('๊ต์๋ฒ ร ํ์ต์๊ฐ')
axes[0].legend(title='ํ์ต์๊ฐ')
# ์ํธ์์ฉ ํ๋กฏ
for time in ['Low', 'High']:
means = df_two[df_two['time'] == time].groupby('method')['score'].mean()
axes[1].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)
axes[1].set_xlabel('๊ต์๋ฒ')
axes[1].set_ylabel('ํ๊ท ์ ์')
axes[1].set_title('์ํธ์์ฉ ํ๋กฏ')
axes[1].legend(title='ํ์ต์๊ฐ')
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
4.2 ์ด์ ANOVA ๋ถ์¶
# statsmodels๋ฅผ ์ด์ฉํ ์ด์ ANOVA
model_two = ols('score ~ C(method) * C(time)', data=df_two).fit()
anova_two = sm.stats.anova_lm(model_two, typ=2)
print("์ด์๋ฐฐ์น ANOVA ๊ฒฐ๊ณผ:")
print(anova_two)
# pingouin
anova_two_pg = pg.anova(data=df_two, dv='score', between=['method', 'time'])
print("\npingouin ์ด์ ANOVA:")
print(anova_two_pg)
# ํจ๊ณผ ํด์
print("\nํจ๊ณผ ํด์:")
print("-" * 50)
for idx, row in anova_two.iterrows():
if 'Residual' not in idx:
sig = "***" if row['PR(>F)'] < 0.001 else ("**" if row['PR(>F)'] < 0.01 else ("*" if row['PR(>F)'] < 0.05 else ""))
print(f"{idx}: F = {row['F']:.2f}, p = {row['PR(>F)']:.4f} {sig}")
4.3 ์ํธ์์ฉ ํจ๊ณผ ํด์¶
# ์ํธ์์ฉ์ด ์ ์ํ ๋: ๋จ์ ์ฃผํจ๊ณผ ๋ถ์
# ํ์ต์๊ฐ๋ณ ๊ต์๋ฒ ํจ๊ณผ
print("๋จ์ ์ฃผํจ๊ณผ ๋ถ์: ํ์ต์๊ฐ๋ณ ๊ต์๋ฒ ํจ๊ณผ")
print("-" * 50)
for time in ['Low', 'High']:
subset = df_two[df_two['time'] == time]
t_stat, p_val = stats.ttest_ind(
subset[subset['method'] == 'A']['score'],
subset[subset['method'] == 'B']['score']
)
print(f"{time} ํ์ต์๊ฐ: t = {t_stat:.3f}, p = {p_val:.4f}")
# ๊ต์๋ฒ๋ณ ํ์ต์๊ฐ ํจ๊ณผ
print("\n๋จ์ ์ฃผํจ๊ณผ ๋ถ์: ๊ต์๋ฒ๋ณ ํ์ต์๊ฐ ํจ๊ณผ")
print("-" * 50)
for method in ['A', 'B']:
subset = df_two[df_two['method'] == method]
t_stat, p_val = stats.ttest_ind(
subset[subset['time'] == 'Low']['score'],
subset[subset['time'] == 'High']['score']
)
d = pg.compute_effsize(
subset[subset['time'] == 'High']['score'],
subset[subset['time'] == 'Low']['score'],
eftype='cohen'
)
print(f"๊ต์๋ฒ {method}: t = {t_stat:.3f}, p = {p_val:.4f}, d = {d:.3f}")
4.4 ์ํธ์์ฉ ์๋ ๊ฒฝ์ฐ¶
# ์ํธ์์ฉ์ด ์๋ ๋ฐ์ดํฐ ์์ฑ
np.random.seed(42)
data_no_int = {
'score': [],
'method': [],
'time': []
}
# ์ํธ์์ฉ ์์: ๊ฐ ์์ธ์ ํจ๊ณผ๊ฐ ๋
๋ฆฝ์
effects_no_int = {
('A', 'Low'): 70,
('A', 'High'): 80, # +10 (์๊ฐ ํจ๊ณผ)
('B', 'Low'): 78, # +8 (๋ฐฉ๋ฒ ํจ๊ณผ)
('B', 'High'): 88, # +8 + 10 (๊ฐ์ฐ์ )
}
for (method, time), mean in effects_no_int.items():
scores = np.random.normal(mean, 8, 20)
data_no_int['score'].extend(scores)
data_no_int['method'].extend([method] * 20)
data_no_int['time'].extend([time] * 20)
df_no_int = pd.DataFrame(data_no_int)
# ์ํธ์์ฉ ํ๋กฏ
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# ์ํธ์์ฉ ์๋ ๊ฒฝ์ฐ
for time in ['Low', 'High']:
means = df_two[df_two['time'] == time].groupby('method')['score'].mean()
axes[0].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)
axes[0].set_xlabel('๊ต์๋ฒ')
axes[0].set_ylabel('ํ๊ท ์ ์')
axes[0].set_title('์ํธ์์ฉ ์์ (์ ์ด ๊ต์ฐจ/๋นํํ)')
axes[0].legend(title='ํ์ต์๊ฐ')
axes[0].grid(alpha=0.3)
# ์ํธ์์ฉ ์๋ ๊ฒฝ์ฐ
for time in ['Low', 'High']:
means = df_no_int[df_no_int['time'] == time].groupby('method')['score'].mean()
axes[1].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)
axes[1].set_xlabel('๊ต์๋ฒ')
axes[1].set_ylabel('ํ๊ท ์ ์')
axes[1].set_title('์ํธ์์ฉ ์์ (์ ์ด ํํ)')
axes[1].legend(title='ํ์ต์๊ฐ')
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
# ANOVA ๋น๊ต
print("์ํธ์์ฉ ์๋ ๊ฒฝ์ฐ ANOVA:")
model_no_int = ols('score ~ C(method) * C(time)', data=df_no_int).fit()
print(sm.stats.anova_lm(model_no_int, typ=2))
5. ๋ฐ๋ณต์ธก์ ๋ถ์ฐ๋ถ์ (Repeated Measures ANOVA)¶
5.1 ๊ธฐ๋ณธ ๊ฐ๋ ¶
# ๊ฐ์ ํผํ์๋ฅผ ์ฌ๋ฌ ์กฐ๊ฑด์์ ์ธก์
# ํผํ์ ๋ด ์ค๊ณ (within-subjects design)
np.random.seed(42)
# ์์: ๋์ผ ํผํ์๊ฐ 3๊ฐ์ง ์ฝ๋ฌผ ์กฐ๊ฑด์์ ์ธก์
n_subjects = 20
# ๊ฐ์ธ์ฐจ (ํผํ์ ํจ๊ณผ)
subject_effect = np.random.normal(0, 10, n_subjects)
# ๊ฐ ์กฐ๊ฑด์ ํ๊ท ํจ๊ณผ
condition_effects = {'Placebo': 0, 'Drug_A': 5, 'Drug_B': 10}
data_rm = []
for subj in range(n_subjects):
for condition, effect in condition_effects.items():
score = 50 + subject_effect[subj] + effect + np.random.normal(0, 5)
data_rm.append({
'subject': f'S{subj+1:02d}',
'condition': condition,
'score': score
})
df_rm = pd.DataFrame(data_rm)
# ์๊ฐํ
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Box plot
sns.boxplot(data=df_rm, x='condition', y='score', ax=axes[0],
order=['Placebo', 'Drug_A', 'Drug_B'])
axes[0].set_title('์กฐ๊ฑด๋ณ ์ ์ ๋ถํฌ')
# ๊ฐ์ธ๋ณ ๋ณํ
for subj in df_rm['subject'].unique()[:10]: # ์ฒ์ 10๋ช
๋ง
subj_data = df_rm[df_rm['subject'] == subj]
subj_data = subj_data.set_index('condition').loc[['Placebo', 'Drug_A', 'Drug_B']]
axes[1].plot(range(3), subj_data['score'].values, 'o-', alpha=0.5)
# ํ๊ท ์ถ๊ฐ
means = df_rm.groupby('condition')['score'].mean()
means = means.loc[['Placebo', 'Drug_A', 'Drug_B']]
axes[1].plot(range(3), means.values, 'rs-', markersize=12, linewidth=3, label='ํ๊ท ')
axes[1].set_xticks(range(3))
axes[1].set_xticklabels(['Placebo', 'Drug_A', 'Drug_B'])
axes[1].set_xlabel('์กฐ๊ฑด')
axes[1].set_ylabel('์ ์')
axes[1].set_title('๊ฐ์ธ๋ณ ๋ณํ ํจํด')
axes[1].legend()
plt.tight_layout()
plt.show()
5.2 ๋ฐ๋ณต์ธก์ ANOVA ๋ถ์¶
# pingouin์ ์ด์ฉํ ๋ฐ๋ณต์ธก์ ANOVA
rm_anova = pg.rm_anova(data=df_rm, dv='score', within='condition',
subject='subject', detailed=True)
print("๋ฐ๋ณต์ธก์ ANOVA ๊ฒฐ๊ณผ:")
print(rm_anova)
# ๊ตฌํ์ฑ ๊ฒ์ (Mauchly's test)
# ๋ฐ๋ณต์ธก์ ANOVA์ ๊ฐ์
print("\n๊ตฌํ์ฑ ๊ฐ์ :")
print(" ๊ตฌํ์ฑ์ด ์๋ฐ๋๋ฉด Greenhouse-Geisser ๋๋ Huynh-Feldt ๋ณด์ ์ฌ์ฉ")
print(f" GG epsilon: {rm_anova['eps'].values[0]:.4f}")
# ์ฌํ๊ฒ์
posthoc_rm = pg.pairwise_tests(data=df_rm, dv='score', within='condition',
subject='subject', padjust='bonf')
print("\n์ฌํ๊ฒ์ (Bonferroni):")
print(posthoc_rm[['A', 'B', 'T', 'p-unc', 'p-corr', 'BF10']])
5.3 ํผํฉ ์ค๊ณ ANOVA¶
# ํผํฉ ์ค๊ณ: ํผํ์ ๊ฐ ์์ธ + ํผํ์ ๋ด ์์ธ
np.random.seed(42)
# 2 (๊ทธ๋ฃน: ์คํ/ํต์ ) ร 3 (์์ : ์ /์ค/ํ) ํผํฉ ์ค๊ณ
n_per_group = 15
data_mixed = []
for group, group_effect in [('Experimental', 5), ('Control', 0)]:
for subj in range(n_per_group):
subject_id = f'{group[0]}_{subj+1:02d}'
base = 50 + np.random.normal(0, 8)
for time_idx, (time, time_effect) in enumerate([('Pre', 0), ('Mid', 3), ('Post', 6)]):
# ์ํธ์์ฉ: ์คํ๊ตฐ์ ์๊ฐ์ ๋ฐ๋ฅธ ํจ๊ณผ๊ฐ ๋ ํผ
interaction = time_idx * 3 if group == 'Experimental' else 0
score = base + group_effect + time_effect + interaction + np.random.normal(0, 5)
data_mixed.append({
'subject': subject_id,
'group': group,
'time': time,
'score': score
})
df_mixed = pd.DataFrame(data_mixed)
# ์๊ฐํ
fig, ax = plt.subplots(figsize=(10, 5))
for group in ['Experimental', 'Control']:
means = df_mixed[df_mixed['group'] == group].groupby('time')['score'].mean()
sems = df_mixed[df_mixed['group'] == group].groupby('time')['score'].sem()
means = means.loc[['Pre', 'Mid', 'Post']]
sems = sems.loc[['Pre', 'Mid', 'Post']]
ax.errorbar(range(3), means.values, yerr=sems.values, fmt='o-',
label=group, markersize=10, capsize=5)
ax.set_xticks(range(3))
ax.set_xticklabels(['Pre', 'Mid', 'Post'])
ax.set_xlabel('์์ ')
ax.set_ylabel('์ ์')
ax.set_title('ํผํฉ ์ค๊ณ: ๊ทธ๋ฃน ร ์์ ')
ax.legend()
ax.grid(alpha=0.3)
plt.show()
# ํผํฉ ANOVA
mixed_anova = pg.mixed_anova(data=df_mixed, dv='score', between='group',
within='time', subject='subject')
print("ํผํฉ ์ค๊ณ ANOVA:")
print(mixed_anova)
6. ๋น๋ชจ์ ๋์¶
6.1 Kruskal-Wallis ๊ฒ์ ¶
# ์ ๊ท์ฑ ๊ฐ์ ์ ์ถฉ์กฑํ์ง ๋ชปํ ๋
# ๋น์ ๊ท ๋ฐ์ดํฐ ์์ฑ
np.random.seed(42)
group1_nonnorm = np.random.exponential(10, 30)
group2_nonnorm = np.random.exponential(15, 30)
group3_nonnorm = np.random.exponential(20, 30)
# ์ ๊ท์ฑ ๊ฒ์
print("์ ๊ท์ฑ ๊ฒ์ :")
for i, g in enumerate([group1_nonnorm, group2_nonnorm, group3_nonnorm], 1):
stat, p = stats.shapiro(g)
print(f" ๊ทธ๋ฃน {i}: p = {p:.4f}")
# Kruskal-Wallis ๊ฒ์
H_stat, p_kw = stats.kruskal(group1_nonnorm, group2_nonnorm, group3_nonnorm)
print(f"\nKruskal-Wallis ๊ฒ์ :")
print(f" H = {H_stat:.3f}, p = {p_kw:.4f}")
# ๋น๊ต: ์ผ์ ANOVA
F_stat, p_anova = stats.f_oneway(group1_nonnorm, group2_nonnorm, group3_nonnorm)
print(f"\n์ผ์ ANOVA (๋น๊ต์ฉ):")
print(f" F = {F_stat:.3f}, p = {p_anova:.4f}")
# ์ฌํ๊ฒ์ : Dunn's test
df_nonnorm = pd.DataFrame({
'value': np.concatenate([group1_nonnorm, group2_nonnorm, group3_nonnorm]),
'group': ['G1']*30 + ['G2']*30 + ['G3']*30
})
dunn_result = pg.pairwise_tests(data=df_nonnorm, dv='value', between='group',
parametric=False, padjust='bonf')
print("\nDunn's ์ฌํ๊ฒ์ :")
print(dunn_result[['A', 'B', 'U-val', 'p-unc', 'p-corr']])
6.2 Friedman ๊ฒ์ ¶
# ๋ฐ๋ณต์ธก์ ์ ๋น๋ชจ์ ๋์
# ๋น์ ๊ท ๋ฐ๋ณต์ธก์ ๋ฐ์ดํฐ
np.random.seed(42)
n_subjects = 20
cond1 = np.random.exponential(10, n_subjects)
cond2 = np.random.exponential(15, n_subjects) + cond1 * 0.3 # ์๊ด ์๋ ์ธก์
cond3 = np.random.exponential(20, n_subjects) + cond1 * 0.3
# Friedman ๊ฒ์
chi2_stat, p_friedman = stats.friedmanchisquare(cond1, cond2, cond3)
print(f"Friedman ๊ฒ์ :")
print(f" ฯยฒ = {chi2_stat:.3f}, p = {p_friedman:.4f}")
# ํจ๊ณผํฌ๊ธฐ: Kendall's W
n_conditions = 3
W = chi2_stat / (n_subjects * (n_conditions - 1))
print(f" Kendall's W = {W:.4f}")
์ฐ์ต ๋ฌธ์ ¶
๋ฌธ์ 1: ์ผ์ ANOVA¶
์ธ ๊ฐ์ง ๋น๋ฃ(A, B, C)๊ฐ ์๋ฌผ ์ฑ์ฅ์ ๋ฏธ์น๋ ํจ๊ณผ๋ฅผ ๋ถ์ํ์์ค.
fertilizer_A = [20, 22, 19, 24, 25, 23, 21, 22, 26, 24]
fertilizer_B = [28, 30, 27, 29, 31, 28, 30, 29, 32, 31]
fertilizer_C = [25, 27, 26, 28, 24, 26, 27, 25, 29, 26]
๋ฌธ์ 2: ์ด์ ANOVA¶
์ฑ๋ณ(๋จ/์ฌ) ร ํ์ต๋ฐฉ๋ฒ(์จ๋ผ์ธ/์คํ๋ผ์ธ)์ ํจ๊ณผ๋ฅผ ๋ถ์ํ์์ค. - ์ฃผํจ๊ณผ์ ์ํธ์์ฉ ํจ๊ณผ๋ฅผ ํด์ํ์์ค. - ์ํธ์์ฉ ํ๋กฏ์ ๊ทธ๋ฆฌ์์ค.
๋ฌธ์ 3: ์ฌํ๊ฒ์ ¶
๋ฌธ์ 1์ ๊ฒฐ๊ณผ๊ฐ ์ ์ํ ๊ฒฝ์ฐ, Tukey HSD ์ฌํ๊ฒ์ ์ ์ํํ๊ณ ํด์ํ์์ค.
์ ๋ฆฌ¶
| ANOVA ์ ํ | ์ค๊ณ | Python ํจ์ |
|---|---|---|
| ์ผ์๋ฐฐ์น | 1 ์์ธ, k ์์ค | stats.f_oneway(), pg.anova() |
| ์ด์๋ฐฐ์น | 2 ์์ธ | ols() + anova_lm() |
| ๋ฐ๋ณต์ธก์ | ํผํ์ ๋ด ์์ธ | pg.rm_anova() |
| ํผํฉ์ค๊ณ | ํผํ์ ๊ฐ+๋ด | pg.mixed_anova() |
| Kruskal-Wallis | ๋น๋ชจ์ ์ผ์ | stats.kruskal() |
| Friedman | ๋น๋ชจ์ ๋ฐ๋ณต์ธก์ | stats.friedmanchisquare() |
| ์ฌํ๊ฒ์ | ํน์ง | ํจ์ |
|---|---|---|
| Tukey HSD | ๋ชจ๋ ์ ๋น๊ต | pairwise_tukeyhsd() |
| Bonferroni | ๋ณด์์ | pg.pairwise_tests(padjust='bonf') |
| Dunnett | ๋์กฐ๊ตฐ ๋น๊ต | stats.dunnett() |
| Games-Howell | ๋ฑ๋ถ์ฐ X | pg.pairwise_gameshowell() |