15. ๋ถ„์‚ฐ๋ถ„์„ (ANOVA - Analysis of Variance)

15. ๋ถ„์‚ฐ๋ถ„์„ (ANOVA - Analysis of Variance)

์ด์ „: ๊ฐ€์„ค๊ฒ€์ • ์‹ฌํ™” | ๋‹ค์Œ: ํšŒ๊ท€๋ถ„์„ ์‹ฌํ™”

๊ฐœ์š”

๋ถ„์‚ฐ๋ถ„์„(ANOVA)์€ ์„ธ ๊ฐœ ์ด์ƒ์˜ ๊ทธ๋ฃน ํ‰๊ท ์„ ๋™์‹œ์— ๋น„๊ตํ•˜๋Š” ํ†ต๊ณ„ ๊ธฐ๋ฒ•์ž…๋‹ˆ๋‹ค. ์—ฌ๋Ÿฌ t-๊ฒ€์ •์„ ๋ฐ˜๋ณตํ•˜๋Š” ๊ฒƒ๋ณด๋‹ค ์ œ1์ข… ์˜ค๋ฅ˜๋ฅผ ํ†ต์ œํ•˜๋ฉด์„œ ํšจ์œจ์ ์œผ๋กœ ๋ถ„์„ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.


1. ์ผ์›๋ฐฐ์น˜ ๋ถ„์‚ฐ๋ถ„์„ (One-way ANOVA)

1.1 ๊ธฐ๋ณธ ๊ฐœ๋…

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
import statsmodels.api as sm
from statsmodels.formula.api import ols

# ANOVA์˜ ๊ธฐ๋ณธ ์•„์ด๋””์–ด:
# ์ด ๋ณ€๋™ = ๊ทธ๋ฃน ๊ฐ„ ๋ณ€๋™ + ๊ทธ๋ฃน ๋‚ด ๋ณ€๋™
# Hโ‚€: ฮผโ‚ = ฮผโ‚‚ = ... = ฮผโ‚– (๋ชจ๋“  ๊ทธ๋ฃน ํ‰๊ท ์ด ๊ฐ™๋‹ค)
# Hโ‚: ์ ์–ด๋„ ํ•˜๋‚˜์˜ ๊ทธ๋ฃน ํ‰๊ท ์ด ๋‹ค๋ฅด๋‹ค

# ์˜ˆ์‹œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
np.random.seed(42)

# ์„ธ ๊ฐ€์ง€ ๊ต์ˆ˜๋ฒ•์˜ ํšจ๊ณผ ๋น„๊ต
method_A = np.random.normal(75, 10, 25)  # ์ „ํ†ต์  ๊ต์ˆ˜๋ฒ•
method_B = np.random.normal(80, 10, 25)  # ํ† ๋ก ์‹ ๊ต์ˆ˜๋ฒ•
method_C = np.random.normal(85, 10, 25)  # ํ”Œ๋ฆฝ๋Ÿฌ๋‹

# DataFrame ์ƒ์„ฑ
df = pd.DataFrame({
    'score': np.concatenate([method_A, method_B, method_C]),
    'method': ['A']*25 + ['B']*25 + ['C']*25
})

# ์‹œ๊ฐํ™”
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(data=df, x='method', y='score', ax=axes[0], palette='Set2')
axes[0].set_xlabel('๊ต์ˆ˜๋ฒ•')
axes[0].set_ylabel('์ ์ˆ˜')
axes[0].set_title('๊ต์ˆ˜๋ฒ•๋ณ„ ์ ์ˆ˜ ๋ถ„ํฌ')

# ๊ฐ ๊ทธ๋ฃน์˜ ํ‰๊ท ๊ณผ ๊ฐœ๋ณ„ ์ ์ˆ˜
for i, method in enumerate(['A', 'B', 'C']):
    data = df[df['method'] == method]['score']
    axes[1].scatter([i]*len(data), data, alpha=0.5, s=30)
    axes[1].scatter([i], [data.mean()], color='red', s=100, marker='D', zorder=5)

axes[1].set_xticks([0, 1, 2])
axes[1].set_xticklabels(['A', 'B', 'C'])
axes[1].set_xlabel('๊ต์ˆ˜๋ฒ•')
axes[1].set_ylabel('์ ์ˆ˜')
axes[1].set_title('๊ฐœ๋ณ„ ์ ์ˆ˜์™€ ๊ทธ๋ฃน ํ‰๊ท ')
axes[1].axhline(df['score'].mean(), color='blue', linestyle='--', label='์ „์ฒด ํ‰๊ท ')
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"์ „์ฒด ํ‰๊ท : {df['score'].mean():.2f}")
print(f"๊ทธ๋ฃน๋ณ„ ํ‰๊ท :")
print(df.groupby('method')['score'].mean())

1.2 ANOVA ๊ณ„์‚ฐ

def one_way_anova_manual(groups):
    """์ผ์›๋ฐฐ์น˜ ANOVA ์ˆ˜๋™ ๊ณ„์‚ฐ"""
    # ์ „์ฒด ๋ฐ์ดํ„ฐ
    all_data = np.concatenate(groups)
    n_total = len(all_data)
    k = len(groups)  # ๊ทธ๋ฃน ์ˆ˜
    grand_mean = all_data.mean()

    # ๊ทธ๋ฃน ๊ฐ„ ๋ณ€๋™ (SSB: Sum of Squares Between)
    SSB = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)

    # ๊ทธ๋ฃน ๋‚ด ๋ณ€๋™ (SSW: Sum of Squares Within)
    SSW = sum(sum((x - g.mean())**2 for x in g) for g in groups)

    # ์ด ๋ณ€๋™ (SST: Sum of Squares Total)
    SST = sum((x - grand_mean)**2 for x in all_data)

    # ์ž์œ ๋„
    df_between = k - 1
    df_within = n_total - k
    df_total = n_total - 1

    # ํ‰๊ท ์ œ๊ณฑ (Mean Squares)
    MSB = SSB / df_between
    MSW = SSW / df_within

    # F-ํ†ต๊ณ„๋Ÿ‰
    F_stat = MSB / MSW

    # p-value
    p_value = 1 - stats.f.cdf(F_stat, df_between, df_within)

    return {
        'SSB': SSB, 'SSW': SSW, 'SST': SST,
        'df_between': df_between, 'df_within': df_within,
        'MSB': MSB, 'MSW': MSW,
        'F_stat': F_stat, 'p_value': p_value
    }

# ๊ณ„์‚ฐ
groups = [method_A, method_B, method_C]
result = one_way_anova_manual(groups)

print("์ผ์›๋ฐฐ์น˜ ANOVA ๊ฒฐ๊ณผ (์ˆ˜๋™ ๊ณ„์‚ฐ):")
print("=" * 60)
print("๋ถ„์‚ฐ๋ถ„์„ํ‘œ:")
print("-" * 60)
print(f"{'๋ณ€๋™์›':<12} {'SS':<12} {'df':<8} {'MS':<12} {'F':<10} {'p-value':<10}")
print("-" * 60)
print(f"{'์ฒ˜๋ฆฌ(๊ทธ๋ฃน๊ฐ„)':<12} {result['SSB']:<12.2f} {result['df_between']:<8} "
      f"{result['MSB']:<12.2f} {result['F_stat']:<10.3f} {result['p_value']:<10.4f}")
print(f"{'์˜ค์ฐจ(๊ทธ๋ฃน๋‚ด)':<12} {result['SSW']:<12.2f} {result['df_within']:<8} {result['MSW']:<12.2f}")
print(f"{'์ด๊ณ„':<12} {result['SST']:<12.2f} {result['df_between'] + result['df_within']:<8}")
print("=" * 60)

1.3 scipy์™€ statsmodels ํ™œ์šฉ

# scipy.stats.f_oneway
F_scipy, p_scipy = stats.f_oneway(method_A, method_B, method_C)
print(f"scipy.stats.f_oneway:")
print(f"  F = {F_scipy:.3f}, p = {p_scipy:.4f}")

# statsmodels OLS๋ฅผ ์ด์šฉํ•œ ANOVA
model = ols('score ~ C(method)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(f"\nstatsmodels ANOVA Table:")
print(anova_table)

# pingouin
anova_pg = pg.anova(data=df, dv='score', between='method', detailed=True)
print(f"\npingouin ANOVA:")
print(anova_pg)

1.4 F-๋ถ„ํฌ ์ดํ•ด

# F-๋ถ„ํฌ: ๋‘ ์นด์ด์ œ๊ณฑ ๋ณ€์ˆ˜์˜ ๋น„์œจ
# F = (ฯ‡ยฒโ‚/dfโ‚) / (ฯ‡ยฒโ‚‚/dfโ‚‚)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# F-๋ถ„ํฌ (๋‹ค์–‘ํ•œ ์ž์œ ๋„)
x = np.linspace(0, 5, 200)
df_pairs = [(2, 20), (5, 20), (10, 20), (5, 50)]

for df1, df2 in df_pairs:
    axes[0].plot(x, stats.f.pdf(x, df1, df2), label=f'F({df1},{df2})')

axes[0].set_xlabel('F')
axes[0].set_ylabel('๋ฐ€๋„')
axes[0].set_title('F-๋ถ„ํฌ (๋‹ค์–‘ํ•œ ์ž์œ ๋„)')
axes[0].legend()
axes[0].grid(alpha=0.3)

# ํ˜„์žฌ ๋ถ„์„์˜ F-๋ถ„ํฌ์™€ ๊ด€์ฐฐ๋œ F-๊ฐ’
df1, df2 = result['df_between'], result['df_within']
x = np.linspace(0, 8, 200)
y = stats.f.pdf(x, df1, df2)

axes[1].plot(x, y, 'b-', lw=2, label=f'F({df1},{df2})')
axes[1].fill_between(x, y, where=(x >= result['F_stat']), alpha=0.3, color='red',
                      label=f'p-value = {result["p_value"]:.4f}')
axes[1].axvline(result['F_stat'], color='red', linestyle='--',
                label=f'F = {result["F_stat"]:.2f}')

# ์ž„๊ณ„๊ฐ’
f_critical = stats.f.ppf(0.95, df1, df2)
axes[1].axvline(f_critical, color='green', linestyle=':',
                label=f'F_crit (ฮฑ=0.05) = {f_critical:.2f}')

axes[1].set_xlabel('F')
axes[1].set_ylabel('๋ฐ€๋„')
axes[1].set_title('F-๋ถ„ํฌ์™€ ๊ฒ€์ • ๊ฒฐ๊ณผ')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

2. ANOVA ๊ฐ€์ • ํ™•์ธ

2.1 ์ •๊ทœ์„ฑ ๊ฒ€์ •

# ๊ฐ ๊ทธ๋ฃน์˜ ์ •๊ทœ์„ฑ ๊ฒ€์ •

print("์ •๊ทœ์„ฑ ๊ฒ€์ • (Shapiro-Wilk):")
print("-" * 40)

for method in ['A', 'B', 'C']:
    data = df[df['method'] == method]['score']
    stat, p = stats.shapiro(data)
    print(f"๊ทธ๋ฃน {method}: W = {stat:.4f}, p = {p:.4f}")

# ์ž”์ฐจ์˜ ์ •๊ทœ์„ฑ (๋” ์ค‘์š”)
residuals = model.resid
stat, p = stats.shapiro(residuals)
print(f"\n์ž”์ฐจ ์ •๊ทœ์„ฑ: W = {stat:.4f}, p = {p:.4f}")

# Q-Q plot
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# ๊ทธ๋ฃน๋ณ„ Q-Q plot
for i, method in enumerate(['A', 'B', 'C']):
    data = df[df['method'] == method]['score']
    stats.probplot(data, dist="norm", plot=axes[0])

axes[0].set_title('๊ทธ๋ฃน๋ณ„ Q-Q Plot')

# ์ž”์ฐจ Q-Q plot
stats.probplot(residuals, dist="norm", plot=axes[1])
axes[1].set_title('์ž”์ฐจ Q-Q Plot')

plt.tight_layout()
plt.show()

2.2 ๋“ฑ๋ถ„์‚ฐ์„ฑ ๊ฒ€์ •

# Levene's test (์ค‘์•™๊ฐ’ ๊ธฐ๋ฐ˜, ๋” ๋กœ๋ฒ„์ŠคํŠธ)
stat_levene, p_levene = stats.levene(method_A, method_B, method_C)
print(f"Levene's Test: F = {stat_levene:.4f}, p = {p_levene:.4f}")

# Bartlett's test (์ •๊ทœ์„ฑ ๊ฐ€์ •)
stat_bartlett, p_bartlett = stats.bartlett(method_A, method_B, method_C)
print(f"Bartlett's Test: ฯ‡ยฒ = {stat_bartlett:.4f}, p = {p_bartlett:.4f}")

# pingouin
homoscedasticity_result = pg.homoscedasticity(df, dv='score', group='method')
print(f"\npingouin ๋“ฑ๋ถ„์‚ฐ์„ฑ ๊ฒ€์ •:")
print(homoscedasticity_result)

# ์ž”์ฐจ vs ์ ํ•ฉ๊ฐ’ ๊ทธ๋ฆผ
fig, ax = plt.subplots(figsize=(10, 5))

fitted = model.fittedvalues
residuals = model.resid

ax.scatter(fitted, residuals, alpha=0.6)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('์ ํ•ฉ๊ฐ’')
ax.set_ylabel('์ž”์ฐจ')
ax.set_title('์ž”์ฐจ vs ์ ํ•ฉ๊ฐ’ (๋“ฑ๋ถ„์‚ฐ์„ฑ ํ™•์ธ)')

# ๊ฐ ๊ทธ๋ฃน์˜ ๋ถ„์‚ฐ ํ‘œ์‹œ
for method in ['A', 'B', 'C']:
    data = df[df['method'] == method]
    mean_val = data['score'].mean()
    ax.axvline(mean_val, color='gray', linestyle=':', alpha=0.5)

plt.show()

print("\n๊ทธ๋ฃน๋ณ„ ํ‘œ์ค€ํŽธ์ฐจ:")
print(df.groupby('method')['score'].std())

2.3 ๋“ฑ๋ถ„์‚ฐ ๊ฐ€์ • ์œ„๋ฐ˜ ์‹œ: Welch's ANOVA

# ๋“ฑ๋ถ„์‚ฐ ๊ฐ€์ •์ด ์œ„๋ฐ˜๋œ ๊ฒฝ์šฐ
np.random.seed(42)

# ๋ถ„์‚ฐ์ด ๋‹ค๋ฅธ ๊ทธ๋ฃน๋“ค
group1 = np.random.normal(50, 5, 30)   # ฯƒ = 5
group2 = np.random.normal(55, 15, 30)  # ฯƒ = 15
group3 = np.random.normal(60, 25, 30)  # ฯƒ = 25

df_hetero = pd.DataFrame({
    'value': np.concatenate([group1, group2, group3]),
    'group': ['G1']*30 + ['G2']*30 + ['G3']*30
})

# ๋“ฑ๋ถ„์‚ฐ์„ฑ ๊ฒ€์ •
stat, p = stats.levene(group1, group2, group3)
print(f"Levene's Test: p = {p:.4f} (๋“ฑ๋ถ„์‚ฐ ๊ฐ€์ • ์œ„๋ฐ˜)")

# ์ผ๋ฐ˜ ANOVA
F_normal, p_normal = stats.f_oneway(group1, group2, group3)
print(f"\n์ผ๋ฐ˜ ANOVA: F = {F_normal:.3f}, p = {p_normal:.4f}")

# Welch's ANOVA (pingouin)
welch_result = pg.welch_anova(data=df_hetero, dv='value', between='group')
print(f"\nWelch's ANOVA:")
print(welch_result)

# Games-Howell ์‚ฌํ›„๊ฒ€์ • (๋“ฑ๋ถ„์‚ฐ ๊ฐ€์ • X)
games_howell = pg.pairwise_gameshowell(data=df_hetero, dv='value', between='group')
print(f"\nGames-Howell ์‚ฌํ›„๊ฒ€์ •:")
print(games_howell)

3. ์‚ฌํ›„๊ฒ€์ • (Post-hoc Tests)

3.1 Tukey HSD

# ANOVA๊ฐ€ ์œ ์˜ํ•˜๋ฉด โ†’ ์–ด๋–ค ๊ทธ๋ฃน์ด ๋‹ค๋ฅธ์ง€ ํ™•์ธ
# Tukey's Honest Significant Difference

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# ์›๋ณธ ๋ฐ์ดํ„ฐ๋กœ
tukey_result = pairwise_tukeyhsd(df['score'], df['method'], alpha=0.05)

print("Tukey HSD ์‚ฌํ›„๊ฒ€์ •:")
print(tukey_result)

# ์‹œ๊ฐํ™”
fig = tukey_result.plot_simultaneous(figsize=(10, 4))
plt.title('Tukey HSD: 95% ์‹ ๋ขฐ๊ตฌ๊ฐ„')
plt.xlabel('์ ์ˆ˜')
plt.tight_layout()
plt.show()

# pingouin ์‚ฌ์šฉ
tukey_pg = pg.pairwise_tukey(data=df, dv='score', between='method')
print("\npingouin Tukey HSD:")
print(tukey_pg)

3.2 ๋‹ค์–‘ํ•œ ์‚ฌํ›„๊ฒ€์ • ๋ฐฉ๋ฒ•

# ๋‹ค์–‘ํ•œ ์‚ฌํ›„๊ฒ€์ • ๋น„๊ต

print("๋‹ค์–‘ํ•œ ์‚ฌํ›„๊ฒ€์ • ๋ฐฉ๋ฒ• ๋น„๊ต:")
print("=" * 70)

# 1. Bonferroni
bonf = pg.pairwise_tests(data=df, dv='score', between='method',
                          padjust='bonf', effsize='cohen')
print("\n1. Bonferroni:")
print(bonf[['A', 'B', 'T', 'p-unc', 'p-corr', 'cohen']])

# 2. Holm
holm = pg.pairwise_tests(data=df, dv='score', between='method',
                          padjust='holm', effsize='cohen')
print("\n2. Holm:")
print(holm[['A', 'B', 'p-unc', 'p-corr']])

# 3. Scheffe (๋” ๋ณด์ˆ˜์ )
# statsmodels์—์„œ ์ง์ ‘ ๊ณ„์‚ฐ ํ•„์š”

# 4. Dunnett (๋Œ€์กฐ๊ตฐ ๋น„๊ต)
# A๋ฅผ ๋Œ€์กฐ๊ตฐ์œผ๋กœ ์„ค์ •
from scipy.stats import dunnett

dunnett_result = dunnett(method_B, method_C, control=method_A)
print("\n4. Dunnett (A๊ฐ€ ๋Œ€์กฐ๊ตฐ):")
print(f"  B vs A: statistic={dunnett_result.statistic[0]:.3f}, p={dunnett_result.pvalue[0]:.4f}")
print(f"  C vs A: statistic={dunnett_result.statistic[1]:.3f}, p={dunnett_result.pvalue[1]:.4f}")

3.3 ํšจ๊ณผํฌ๊ธฐ

# ANOVA ํšจ๊ณผํฌ๊ธฐ

# Eta-squared (ฮทยฒ)
ss_between = result['SSB']
ss_total = result['SST']
eta_squared = ss_between / ss_total

# Omega-squared (ฯ‰ยฒ) - ํŽธํ–ฅ ๋ณด์ •
ms_within = result['MSW']
n_total = len(df)
k = 3
omega_squared = (ss_between - (k-1)*ms_within) / (ss_total + ms_within)

# Partial eta-squared (์ด ๊ฒฝ์šฐ eta-squared์™€ ๋™์ผ)
partial_eta_squared = ss_between / (ss_between + result['SSW'])

print("ANOVA ํšจ๊ณผํฌ๊ธฐ:")
print("-" * 40)
print(f"ฮทยฒ (Eta-squared): {eta_squared:.4f}")
print(f"ฯ‰ยฒ (Omega-squared): {omega_squared:.4f}")
print(f"Partial ฮทยฒ: {partial_eta_squared:.4f}")

print("\nํ•ด์„ ๊ธฐ์ค€ (Cohen):")
print("  ฮทยฒ โ‰ˆ 0.01: ์ž‘์€ ํšจ๊ณผ")
print("  ฮทยฒ โ‰ˆ 0.06: ์ค‘๊ฐ„ ํšจ๊ณผ")
print("  ฮทยฒ โ‰ˆ 0.14: ํฐ ํšจ๊ณผ")

# pingouin ๊ฒฐ๊ณผ์—์„œ ํšจ๊ณผํฌ๊ธฐ
print(f"\npingouin ๊ฒฐ๊ณผ์˜ ฮทยฒ: {anova_pg['np2'].values[0]:.4f}")

4. ์ด์›๋ฐฐ์น˜ ๋ถ„์‚ฐ๋ถ„์„ (Two-way ANOVA)

4.1 ๊ธฐ๋ณธ ๊ฐœ๋…

# ๋‘ ๊ฐ€์ง€ ๋…๋ฆฝ๋ณ€์ˆ˜(์š”์ธ)์˜ ํšจ๊ณผ๋ฅผ ๋™์‹œ์— ๋ถ„์„
# ์ฃผํšจ๊ณผ(Main Effect) + ์ƒํ˜ธ์ž‘์šฉ(Interaction)

np.random.seed(42)

# ์˜ˆ์‹œ: ๊ต์ˆ˜๋ฒ•(A, B) ร— ํ•™์Šต์‹œ๊ฐ„(Low, High)์˜ ํšจ๊ณผ
n_per_cell = 20

data = {
    'score': [],
    'method': [],
    'time': []
}

# 2x2 ์„ค๊ณ„
# ์ƒํ˜ธ์ž‘์šฉ ํšจ๊ณผ ํฌํ•จ
effects = {
    ('A', 'Low'): 70,
    ('A', 'High'): 80,
    ('B', 'Low'): 75,
    ('B', 'High'): 90,  # B + High์˜ ์‹œ๋„ˆ์ง€ ํšจ๊ณผ
}

for (method, time), mean in effects.items():
    scores = np.random.normal(mean, 8, n_per_cell)
    data['score'].extend(scores)
    data['method'].extend([method] * n_per_cell)
    data['time'].extend([time] * n_per_cell)

df_two = pd.DataFrame(data)

# ์…€ ํ‰๊ท  ํ™•์ธ
cell_means = df_two.groupby(['method', 'time'])['score'].mean().unstack()
print("์…€ ํ‰๊ท :")
print(cell_means)

# ์‹œ๊ฐํ™”
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(data=df_two, x='method', y='score', hue='time', ax=axes[0])
axes[0].set_title('๊ต์ˆ˜๋ฒ• ร— ํ•™์Šต์‹œ๊ฐ„')
axes[0].legend(title='ํ•™์Šต์‹œ๊ฐ„')

# ์ƒํ˜ธ์ž‘์šฉ ํ”Œ๋กฏ
for time in ['Low', 'High']:
    means = df_two[df_two['time'] == time].groupby('method')['score'].mean()
    axes[1].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)

axes[1].set_xlabel('๊ต์ˆ˜๋ฒ•')
axes[1].set_ylabel('ํ‰๊ท  ์ ์ˆ˜')
axes[1].set_title('์ƒํ˜ธ์ž‘์šฉ ํ”Œ๋กฏ')
axes[1].legend(title='ํ•™์Šต์‹œ๊ฐ„')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

4.2 ์ด์› ANOVA ๋ถ„์„

# statsmodels๋ฅผ ์ด์šฉํ•œ ์ด์› ANOVA
model_two = ols('score ~ C(method) * C(time)', data=df_two).fit()
anova_two = sm.stats.anova_lm(model_two, typ=2)

print("์ด์›๋ฐฐ์น˜ ANOVA ๊ฒฐ๊ณผ:")
print(anova_two)

# pingouin
anova_two_pg = pg.anova(data=df_two, dv='score', between=['method', 'time'])
print("\npingouin ์ด์› ANOVA:")
print(anova_two_pg)

# ํšจ๊ณผ ํ•ด์„
print("\nํšจ๊ณผ ํ•ด์„:")
print("-" * 50)
for idx, row in anova_two.iterrows():
    if 'Residual' not in idx:
        sig = "***" if row['PR(>F)'] < 0.001 else ("**" if row['PR(>F)'] < 0.01 else ("*" if row['PR(>F)'] < 0.05 else ""))
        print(f"{idx}: F = {row['F']:.2f}, p = {row['PR(>F)']:.4f} {sig}")

4.3 ์ƒํ˜ธ์ž‘์šฉ ํšจ๊ณผ ํ•ด์„

# ์ƒํ˜ธ์ž‘์šฉ์ด ์œ ์˜ํ•  ๋•Œ: ๋‹จ์ˆœ ์ฃผํšจ๊ณผ ๋ถ„์„

# ํ•™์Šต์‹œ๊ฐ„๋ณ„ ๊ต์ˆ˜๋ฒ• ํšจ๊ณผ
print("๋‹จ์ˆœ ์ฃผํšจ๊ณผ ๋ถ„์„: ํ•™์Šต์‹œ๊ฐ„๋ณ„ ๊ต์ˆ˜๋ฒ• ํšจ๊ณผ")
print("-" * 50)

for time in ['Low', 'High']:
    subset = df_two[df_two['time'] == time]
    t_stat, p_val = stats.ttest_ind(
        subset[subset['method'] == 'A']['score'],
        subset[subset['method'] == 'B']['score']
    )
    print(f"{time} ํ•™์Šต์‹œ๊ฐ„: t = {t_stat:.3f}, p = {p_val:.4f}")

# ๊ต์ˆ˜๋ฒ•๋ณ„ ํ•™์Šต์‹œ๊ฐ„ ํšจ๊ณผ
print("\n๋‹จ์ˆœ ์ฃผํšจ๊ณผ ๋ถ„์„: ๊ต์ˆ˜๋ฒ•๋ณ„ ํ•™์Šต์‹œ๊ฐ„ ํšจ๊ณผ")
print("-" * 50)

for method in ['A', 'B']:
    subset = df_two[df_two['method'] == method]
    t_stat, p_val = stats.ttest_ind(
        subset[subset['time'] == 'Low']['score'],
        subset[subset['time'] == 'High']['score']
    )
    d = pg.compute_effsize(
        subset[subset['time'] == 'High']['score'],
        subset[subset['time'] == 'Low']['score'],
        eftype='cohen'
    )
    print(f"๊ต์ˆ˜๋ฒ• {method}: t = {t_stat:.3f}, p = {p_val:.4f}, d = {d:.3f}")

4.4 ์ƒํ˜ธ์ž‘์šฉ ์—†๋Š” ๊ฒฝ์šฐ

# ์ƒํ˜ธ์ž‘์šฉ์ด ์—†๋Š” ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
np.random.seed(42)

data_no_int = {
    'score': [],
    'method': [],
    'time': []
}

# ์ƒํ˜ธ์ž‘์šฉ ์—†์Œ: ๊ฐ ์š”์ธ์˜ ํšจ๊ณผ๊ฐ€ ๋…๋ฆฝ์ 
effects_no_int = {
    ('A', 'Low'): 70,
    ('A', 'High'): 80,   # +10 (์‹œ๊ฐ„ ํšจ๊ณผ)
    ('B', 'Low'): 78,    # +8 (๋ฐฉ๋ฒ• ํšจ๊ณผ)
    ('B', 'High'): 88,   # +8 + 10 (๊ฐ€์‚ฐ์ )
}

for (method, time), mean in effects_no_int.items():
    scores = np.random.normal(mean, 8, 20)
    data_no_int['score'].extend(scores)
    data_no_int['method'].extend([method] * 20)
    data_no_int['time'].extend([time] * 20)

df_no_int = pd.DataFrame(data_no_int)

# ์ƒํ˜ธ์ž‘์šฉ ํ”Œ๋กฏ
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# ์ƒํ˜ธ์ž‘์šฉ ์žˆ๋Š” ๊ฒฝ์šฐ
for time in ['Low', 'High']:
    means = df_two[df_two['time'] == time].groupby('method')['score'].mean()
    axes[0].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)
axes[0].set_xlabel('๊ต์ˆ˜๋ฒ•')
axes[0].set_ylabel('ํ‰๊ท  ์ ์ˆ˜')
axes[0].set_title('์ƒํ˜ธ์ž‘์šฉ ์žˆ์Œ (์„ ์ด ๊ต์ฐจ/๋น„ํ‰ํ–‰)')
axes[0].legend(title='ํ•™์Šต์‹œ๊ฐ„')
axes[0].grid(alpha=0.3)

# ์ƒํ˜ธ์ž‘์šฉ ์—†๋Š” ๊ฒฝ์šฐ
for time in ['Low', 'High']:
    means = df_no_int[df_no_int['time'] == time].groupby('method')['score'].mean()
    axes[1].plot(['A', 'B'], means.values, 'o-', label=time, markersize=10)
axes[1].set_xlabel('๊ต์ˆ˜๋ฒ•')
axes[1].set_ylabel('ํ‰๊ท  ์ ์ˆ˜')
axes[1].set_title('์ƒํ˜ธ์ž‘์šฉ ์—†์Œ (์„ ์ด ํ‰ํ–‰)')
axes[1].legend(title='ํ•™์Šต์‹œ๊ฐ„')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# ANOVA ๋น„๊ต
print("์ƒํ˜ธ์ž‘์šฉ ์—†๋Š” ๊ฒฝ์šฐ ANOVA:")
model_no_int = ols('score ~ C(method) * C(time)', data=df_no_int).fit()
print(sm.stats.anova_lm(model_no_int, typ=2))

5. ๋ฐ˜๋ณต์ธก์ • ๋ถ„์‚ฐ๋ถ„์„ (Repeated Measures ANOVA)

5.1 ๊ธฐ๋ณธ ๊ฐœ๋…

# ๊ฐ™์€ ํ”ผํ—˜์ž๋ฅผ ์—ฌ๋Ÿฌ ์กฐ๊ฑด์—์„œ ์ธก์ •
# ํ”ผํ—˜์ž ๋‚ด ์„ค๊ณ„ (within-subjects design)

np.random.seed(42)

# ์˜ˆ์‹œ: ๋™์ผ ํ”ผํ—˜์ž๊ฐ€ 3๊ฐ€์ง€ ์•ฝ๋ฌผ ์กฐ๊ฑด์—์„œ ์ธก์ •
n_subjects = 20

# ๊ฐœ์ธ์ฐจ (ํ”ผํ—˜์ž ํšจ๊ณผ)
subject_effect = np.random.normal(0, 10, n_subjects)

# ๊ฐ ์กฐ๊ฑด์˜ ํ‰๊ท  ํšจ๊ณผ
condition_effects = {'Placebo': 0, 'Drug_A': 5, 'Drug_B': 10}

data_rm = []
for subj in range(n_subjects):
    for condition, effect in condition_effects.items():
        score = 50 + subject_effect[subj] + effect + np.random.normal(0, 5)
        data_rm.append({
            'subject': f'S{subj+1:02d}',
            'condition': condition,
            'score': score
        })

df_rm = pd.DataFrame(data_rm)

# ์‹œ๊ฐํ™”
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
sns.boxplot(data=df_rm, x='condition', y='score', ax=axes[0],
            order=['Placebo', 'Drug_A', 'Drug_B'])
axes[0].set_title('์กฐ๊ฑด๋ณ„ ์ ์ˆ˜ ๋ถ„ํฌ')

# ๊ฐœ์ธ๋ณ„ ๋ณ€ํ™”
for subj in df_rm['subject'].unique()[:10]:  # ์ฒ˜์Œ 10๋ช…๋งŒ
    subj_data = df_rm[df_rm['subject'] == subj]
    subj_data = subj_data.set_index('condition').loc[['Placebo', 'Drug_A', 'Drug_B']]
    axes[1].plot(range(3), subj_data['score'].values, 'o-', alpha=0.5)

# ํ‰๊ท  ์ถ”๊ฐ€
means = df_rm.groupby('condition')['score'].mean()
means = means.loc[['Placebo', 'Drug_A', 'Drug_B']]
axes[1].plot(range(3), means.values, 'rs-', markersize=12, linewidth=3, label='ํ‰๊ท ')

axes[1].set_xticks(range(3))
axes[1].set_xticklabels(['Placebo', 'Drug_A', 'Drug_B'])
axes[1].set_xlabel('์กฐ๊ฑด')
axes[1].set_ylabel('์ ์ˆ˜')
axes[1].set_title('๊ฐœ์ธ๋ณ„ ๋ณ€ํ™” ํŒจํ„ด')
axes[1].legend()

plt.tight_layout()
plt.show()

5.2 ๋ฐ˜๋ณต์ธก์ • ANOVA ๋ถ„์„

# pingouin์„ ์ด์šฉํ•œ ๋ฐ˜๋ณต์ธก์ • ANOVA
rm_anova = pg.rm_anova(data=df_rm, dv='score', within='condition',
                        subject='subject', detailed=True)

print("๋ฐ˜๋ณต์ธก์ • ANOVA ๊ฒฐ๊ณผ:")
print(rm_anova)

# ๊ตฌํ˜•์„ฑ ๊ฒ€์ • (Mauchly's test)
# ๋ฐ˜๋ณต์ธก์ • ANOVA์˜ ๊ฐ€์ •
print("\n๊ตฌํ˜•์„ฑ ๊ฐ€์ •:")
print("  ๊ตฌํ˜•์„ฑ์ด ์œ„๋ฐ˜๋˜๋ฉด Greenhouse-Geisser ๋˜๋Š” Huynh-Feldt ๋ณด์ • ์‚ฌ์šฉ")
print(f"  GG epsilon: {rm_anova['eps'].values[0]:.4f}")

# ์‚ฌํ›„๊ฒ€์ •
posthoc_rm = pg.pairwise_tests(data=df_rm, dv='score', within='condition',
                                subject='subject', padjust='bonf')
print("\n์‚ฌํ›„๊ฒ€์ • (Bonferroni):")
print(posthoc_rm[['A', 'B', 'T', 'p-unc', 'p-corr', 'BF10']])

5.3 ํ˜ผํ•ฉ ์„ค๊ณ„ ANOVA

# ํ˜ผํ•ฉ ์„ค๊ณ„: ํ”ผํ—˜์ž ๊ฐ„ ์š”์ธ + ํ”ผํ—˜์ž ๋‚ด ์š”์ธ
np.random.seed(42)

# 2 (๊ทธ๋ฃน: ์‹คํ—˜/ํ†ต์ œ) ร— 3 (์‹œ์ : ์ „/์ค‘/ํ›„) ํ˜ผํ•ฉ ์„ค๊ณ„
n_per_group = 15

data_mixed = []
for group, group_effect in [('Experimental', 5), ('Control', 0)]:
    for subj in range(n_per_group):
        subject_id = f'{group[0]}_{subj+1:02d}'
        base = 50 + np.random.normal(0, 8)

        for time_idx, (time, time_effect) in enumerate([('Pre', 0), ('Mid', 3), ('Post', 6)]):
            # ์ƒํ˜ธ์ž‘์šฉ: ์‹คํ—˜๊ตฐ์€ ์‹œ๊ฐ„์— ๋”ฐ๋ฅธ ํšจ๊ณผ๊ฐ€ ๋” ํผ
            interaction = time_idx * 3 if group == 'Experimental' else 0
            score = base + group_effect + time_effect + interaction + np.random.normal(0, 5)

            data_mixed.append({
                'subject': subject_id,
                'group': group,
                'time': time,
                'score': score
            })

df_mixed = pd.DataFrame(data_mixed)

# ์‹œ๊ฐํ™”
fig, ax = plt.subplots(figsize=(10, 5))

for group in ['Experimental', 'Control']:
    means = df_mixed[df_mixed['group'] == group].groupby('time')['score'].mean()
    sems = df_mixed[df_mixed['group'] == group].groupby('time')['score'].sem()
    means = means.loc[['Pre', 'Mid', 'Post']]
    sems = sems.loc[['Pre', 'Mid', 'Post']]

    ax.errorbar(range(3), means.values, yerr=sems.values, fmt='o-',
                label=group, markersize=10, capsize=5)

ax.set_xticks(range(3))
ax.set_xticklabels(['Pre', 'Mid', 'Post'])
ax.set_xlabel('์‹œ์ ')
ax.set_ylabel('์ ์ˆ˜')
ax.set_title('ํ˜ผํ•ฉ ์„ค๊ณ„: ๊ทธ๋ฃน ร— ์‹œ์ ')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

# ํ˜ผํ•ฉ ANOVA
mixed_anova = pg.mixed_anova(data=df_mixed, dv='score', between='group',
                              within='time', subject='subject')
print("ํ˜ผํ•ฉ ์„ค๊ณ„ ANOVA:")
print(mixed_anova)

6. ๋น„๋ชจ์ˆ˜ ๋Œ€์•ˆ

6.1 Kruskal-Wallis ๊ฒ€์ •

# ์ •๊ทœ์„ฑ ๊ฐ€์ •์„ ์ถฉ์กฑํ•˜์ง€ ๋ชปํ•  ๋•Œ

# ๋น„์ •๊ทœ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
np.random.seed(42)
group1_nonnorm = np.random.exponential(10, 30)
group2_nonnorm = np.random.exponential(15, 30)
group3_nonnorm = np.random.exponential(20, 30)

# ์ •๊ทœ์„ฑ ๊ฒ€์ •
print("์ •๊ทœ์„ฑ ๊ฒ€์ •:")
for i, g in enumerate([group1_nonnorm, group2_nonnorm, group3_nonnorm], 1):
    stat, p = stats.shapiro(g)
    print(f"  ๊ทธ๋ฃน {i}: p = {p:.4f}")

# Kruskal-Wallis ๊ฒ€์ •
H_stat, p_kw = stats.kruskal(group1_nonnorm, group2_nonnorm, group3_nonnorm)
print(f"\nKruskal-Wallis ๊ฒ€์ •:")
print(f"  H = {H_stat:.3f}, p = {p_kw:.4f}")

# ๋น„๊ต: ์ผ์› ANOVA
F_stat, p_anova = stats.f_oneway(group1_nonnorm, group2_nonnorm, group3_nonnorm)
print(f"\n์ผ์› ANOVA (๋น„๊ต์šฉ):")
print(f"  F = {F_stat:.3f}, p = {p_anova:.4f}")

# ์‚ฌํ›„๊ฒ€์ •: Dunn's test
df_nonnorm = pd.DataFrame({
    'value': np.concatenate([group1_nonnorm, group2_nonnorm, group3_nonnorm]),
    'group': ['G1']*30 + ['G2']*30 + ['G3']*30
})

dunn_result = pg.pairwise_tests(data=df_nonnorm, dv='value', between='group',
                                 parametric=False, padjust='bonf')
print("\nDunn's ์‚ฌํ›„๊ฒ€์ •:")
print(dunn_result[['A', 'B', 'U-val', 'p-unc', 'p-corr']])

6.2 Friedman ๊ฒ€์ •

# ๋ฐ˜๋ณต์ธก์ •์˜ ๋น„๋ชจ์ˆ˜ ๋Œ€์•ˆ

# ๋น„์ •๊ทœ ๋ฐ˜๋ณต์ธก์ • ๋ฐ์ดํ„ฐ
np.random.seed(42)
n_subjects = 20

cond1 = np.random.exponential(10, n_subjects)
cond2 = np.random.exponential(15, n_subjects) + cond1 * 0.3  # ์ƒ๊ด€ ์žˆ๋Š” ์ธก์ •
cond3 = np.random.exponential(20, n_subjects) + cond1 * 0.3

# Friedman ๊ฒ€์ •
chi2_stat, p_friedman = stats.friedmanchisquare(cond1, cond2, cond3)
print(f"Friedman ๊ฒ€์ •:")
print(f"  ฯ‡ยฒ = {chi2_stat:.3f}, p = {p_friedman:.4f}")

# ํšจ๊ณผํฌ๊ธฐ: Kendall's W
n_conditions = 3
W = chi2_stat / (n_subjects * (n_conditions - 1))
print(f"  Kendall's W = {W:.4f}")

์—ฐ์Šต ๋ฌธ์ œ

๋ฌธ์ œ 1: ์ผ์› ANOVA

์„ธ ๊ฐ€์ง€ ๋น„๋ฃŒ(A, B, C)๊ฐ€ ์‹๋ฌผ ์„ฑ์žฅ์— ๋ฏธ์น˜๋Š” ํšจ๊ณผ๋ฅผ ๋ถ„์„ํ•˜์‹œ์˜ค.

fertilizer_A = [20, 22, 19, 24, 25, 23, 21, 22, 26, 24]
fertilizer_B = [28, 30, 27, 29, 31, 28, 30, 29, 32, 31]
fertilizer_C = [25, 27, 26, 28, 24, 26, 27, 25, 29, 26]

๋ฌธ์ œ 2: ์ด์› ANOVA

์„ฑ๋ณ„(๋‚จ/์—ฌ) ร— ํ•™์Šต๋ฐฉ๋ฒ•(์˜จ๋ผ์ธ/์˜คํ”„๋ผ์ธ)์˜ ํšจ๊ณผ๋ฅผ ๋ถ„์„ํ•˜์‹œ์˜ค. - ์ฃผํšจ๊ณผ์™€ ์ƒํ˜ธ์ž‘์šฉ ํšจ๊ณผ๋ฅผ ํ•ด์„ํ•˜์‹œ์˜ค. - ์ƒํ˜ธ์ž‘์šฉ ํ”Œ๋กฏ์„ ๊ทธ๋ฆฌ์‹œ์˜ค.

๋ฌธ์ œ 3: ์‚ฌํ›„๊ฒ€์ •

๋ฌธ์ œ 1์˜ ๊ฒฐ๊ณผ๊ฐ€ ์œ ์˜ํ•  ๊ฒฝ์šฐ, Tukey HSD ์‚ฌํ›„๊ฒ€์ •์„ ์ˆ˜ํ–‰ํ•˜๊ณ  ํ•ด์„ํ•˜์‹œ์˜ค.


์ •๋ฆฌ

ANOVA ์œ ํ˜• ์„ค๊ณ„ Python ํ•จ์ˆ˜
์ผ์›๋ฐฐ์น˜ 1 ์š”์ธ, k ์ˆ˜์ค€ stats.f_oneway(), pg.anova()
์ด์›๋ฐฐ์น˜ 2 ์š”์ธ ols() + anova_lm()
๋ฐ˜๋ณต์ธก์ • ํ”ผํ—˜์ž ๋‚ด ์š”์ธ pg.rm_anova()
ํ˜ผํ•ฉ์„ค๊ณ„ ํ”ผํ—˜์ž ๊ฐ„+๋‚ด pg.mixed_anova()
Kruskal-Wallis ๋น„๋ชจ์ˆ˜ ์ผ์› stats.kruskal()
Friedman ๋น„๋ชจ์ˆ˜ ๋ฐ˜๋ณต์ธก์ • stats.friedmanchisquare()
์‚ฌํ›„๊ฒ€์ • ํŠน์ง• ํ•จ์ˆ˜
Tukey HSD ๋ชจ๋“  ์Œ ๋น„๊ต pairwise_tukeyhsd()
Bonferroni ๋ณด์ˆ˜์  pg.pairwise_tests(padjust='bonf')
Dunnett ๋Œ€์กฐ๊ตฐ ๋น„๊ต stats.dunnett()
Games-Howell ๋“ฑ๋ถ„์‚ฐ X pg.pairwise_gameshowell()
to navigate between lessons