# 04_statistics.py
"""
ํ†ต๊ณ„ ๋ถ„์„ (Statistical Analysis)
Basic Statistical Analysis with Python

๊ธฐ์ดˆ ํ†ต๊ณ„ ๋ถ„์„ ๋ฐฉ๋ฒ•์„ ๋‹ค๋ฃน๋‹ˆ๋‹ค.
"""

import numpy as np
import pandas as pd
from scipy import stats


 13# =============================================================================
 14# 1. ๊ธฐ์ˆ  ํ†ต๊ณ„ (Descriptive Statistics)
 15# =============================================================================
 16def descriptive_stats():
 17    """๊ธฐ์ˆ  ํ†ต๊ณ„๋Ÿ‰"""
 18    print("\n[1] ๊ธฐ์ˆ  ํ†ต๊ณ„๋Ÿ‰")
 19    print("=" * 50)
 20
 21    np.random.seed(42)
 22    data = np.random.normal(100, 15, 1000)  # ํ‰๊ท  100, ํ‘œ์ค€ํŽธ์ฐจ 15
 23
 24    print(f"๋ฐ์ดํ„ฐ ํฌ๊ธฐ: {len(data)}")
 25    print(f"\n์ค‘์‹ฌ ๊ฒฝํ–ฅ:")
 26    print(f"  ํ‰๊ท  (Mean):    {np.mean(data):.2f}")
 27    print(f"  ์ค‘์•™๊ฐ’ (Median): {np.median(data):.2f}")
 28    print(f"  ์ตœ๋นˆ๊ฐ’ (Mode):   {stats.mode(data.round(), keepdims=False).mode:.2f}")
 29
 30    print(f"\n์‚ฐํฌ๋„:")
 31    print(f"  ๋ถ„์‚ฐ (Variance): {np.var(data, ddof=1):.2f}")
 32    print(f"  ํ‘œ์ค€ํŽธ์ฐจ (Std):  {np.std(data, ddof=1):.2f}")
 33    print(f"  ๋ฒ”์œ„ (Range):    {np.ptp(data):.2f}")
 34    print(f"  IQR:             {stats.iqr(data):.2f}")
 35
 36    print(f"\n๋ถ„์œ„์ˆ˜:")
 37    percentiles = [25, 50, 75, 90, 95, 99]
 38    for p in percentiles:
 39        print(f"  {p}th percentile: {np.percentile(data, p):.2f}")
 40
 41    print(f"\nํ˜•ํƒœ:")
 42    print(f"  ์™œ๋„ (Skewness):  {stats.skew(data):.4f}")
 43    print(f"  ์ฒจ๋„ (Kurtosis):  {stats.kurtosis(data):.4f}")
 44
 45
 46# =============================================================================
 47# 2. ์ƒ๊ด€ ๋ถ„์„ (Correlation Analysis)
 48# =============================================================================
 49def correlation_analysis():
 50    """์ƒ๊ด€ ๋ถ„์„"""
 51    print("\n[2] ์ƒ๊ด€ ๋ถ„์„")
 52    print("=" * 50)
 53
 54    np.random.seed(42)
 55    n = 100
 56
 57    # ์ƒ๊ด€๋œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
 58    x = np.random.randn(n)
 59    y = 2 * x + np.random.randn(n) * 0.5  # ๊ฐ•ํ•œ ์–‘์˜ ์ƒ๊ด€
 60    z = -0.5 * x + np.random.randn(n)     # ์•ฝํ•œ ์Œ์˜ ์ƒ๊ด€
 61    w = np.random.randn(n)                 # ๋ฌด์ƒ๊ด€
 62
 63    # ํ”ผ์–ด์Šจ ์ƒ๊ด€๊ณ„์ˆ˜
 64    print("ํ”ผ์–ด์Šจ ์ƒ๊ด€๊ณ„์ˆ˜ (Pearson):")
 65    corr_xy, p_xy = stats.pearsonr(x, y)
 66    corr_xz, p_xz = stats.pearsonr(x, z)
 67    corr_xw, p_xw = stats.pearsonr(x, w)
 68
 69    print(f"  x-y: r = {corr_xy:.4f}, p = {p_xy:.4e}")
 70    print(f"  x-z: r = {corr_xz:.4f}, p = {p_xz:.4e}")
 71    print(f"  x-w: r = {corr_xw:.4f}, p = {p_xw:.4e}")
 72
 73    # ์Šคํ”ผ์–ด๋งŒ ์ˆœ์œ„ ์ƒ๊ด€๊ณ„์ˆ˜
 74    print("\n์Šคํ”ผ์–ด๋งŒ ์ˆœ์œ„ ์ƒ๊ด€๊ณ„์ˆ˜ (Spearman):")
 75    corr_s, p_s = stats.spearmanr(x, y)
 76    print(f"  x-y: ฯ = {corr_s:.4f}, p = {p_s:.4e}")
 77
 78    # DataFrame ์ƒ๊ด€ ํ–‰๋ ฌ
 79    df = pd.DataFrame({'x': x, 'y': y, 'z': z, 'w': w})
 80    print("\n์ƒ๊ด€ ํ–‰๋ ฌ:")
 81    print(df.corr().round(4))
 82
 83
 84# =============================================================================
 85# 3. ๊ฐ€์„ค ๊ฒ€์ • ๊ธฐ์ดˆ
 86# =============================================================================
 87def hypothesis_testing():
 88    """๊ฐ€์„ค ๊ฒ€์ •"""
 89    print("\n[3] ๊ฐ€์„ค ๊ฒ€์ • ๊ธฐ์ดˆ")
 90    print("=" * 50)
 91
 92    np.random.seed(42)
 93
 94    # ๋‹จ์ผ ํ‘œ๋ณธ t-๊ฒ€์ •
 95    print("\n[๋‹จ์ผ ํ‘œ๋ณธ t-๊ฒ€์ •]")
 96    sample = np.random.normal(105, 15, 50)  # ์‹ค์ œ ํ‰๊ท  105
 97    t_stat, p_value = stats.ttest_1samp(sample, 100)  # H0: ฮผ = 100
 98
 99    print(f"ํ‘œ๋ณธ ํ‰๊ท : {np.mean(sample):.2f}")
100    print(f"H0: ฮผ = 100")
101    print(f"t-ํ†ต๊ณ„๋Ÿ‰: {t_stat:.4f}")
102    print(f"p-value: {p_value:.4f}")
103    print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑ„ํƒ'} (ฮฑ=0.05)")
104
105    # ๋…๋ฆฝ ํ‘œ๋ณธ t-๊ฒ€์ •
106    print("\n[๋…๋ฆฝ ํ‘œ๋ณธ t-๊ฒ€์ •]")
107    group1 = np.random.normal(100, 10, 50)
108    group2 = np.random.normal(105, 10, 50)
109
110    t_stat, p_value = stats.ttest_ind(group1, group2)
111
112    print(f"๊ทธ๋ฃน1 ํ‰๊ท : {np.mean(group1):.2f}")
113    print(f"๊ทธ๋ฃน2 ํ‰๊ท : {np.mean(group2):.2f}")
114    print(f"H0: ฮผ1 = ฮผ2")
115    print(f"t-ํ†ต๊ณ„๋Ÿ‰: {t_stat:.4f}")
116    print(f"p-value: {p_value:.4f}")
117    print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑ„ํƒ'} (ฮฑ=0.05)")
118
119    # ๋Œ€์‘ ํ‘œ๋ณธ t-๊ฒ€์ •
120    print("\n[๋Œ€์‘ ํ‘œ๋ณธ t-๊ฒ€์ •]")
121    before = np.random.normal(100, 10, 30)
122    after = before + np.random.normal(5, 3, 30)  # ํ‰๊ท  5 ์ฆ๊ฐ€
123
124    t_stat, p_value = stats.ttest_rel(before, after)
125
126    print(f"์‚ฌ์ „ ํ‰๊ท : {np.mean(before):.2f}")
127    print(f"์‚ฌํ›„ ํ‰๊ท : {np.mean(after):.2f}")
128    print(f"H0: ฮผ_์ฐจ์ด = 0")
129    print(f"t-ํ†ต๊ณ„๋Ÿ‰: {t_stat:.4f}")
130    print(f"p-value: {p_value:.4f}")
131    print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑ„ํƒ'} (ฮฑ=0.05)")
132
133
# =============================================================================
# 4. Chi-Square Tests
# =============================================================================
def chi_square_test():
    """Run a chi-square goodness-of-fit test on fixed observed/expected counts
    and a chi-square test of independence on a 3x3 contingency table."""
    print("\n[4] ์นด์ด์ œ๊ณฑ ๊ฒ€์ •")
    print("=" * 50)

    # Goodness-of-fit: do observed counts match a uniform expectation?
    print("\n[์ ํ•ฉ๋„ ๊ฒ€์ •]")
    observed = np.array([18, 22, 20, 15, 25])  # observed frequencies
    expected = np.array([20, 20, 20, 20, 20])  # expected frequencies

    chi2, p_value = stats.chisquare(observed, expected)

    print(f"๊ด€์ธก๊ฐ’: {observed}")
    print(f"๊ธฐ๋Œ€๊ฐ’: {expected}")
    print(f"ฯ‡ยฒ = {chi2:.4f}")
    print(f"p-value = {p_value:.4f}")
    print(f"๊ฒฐ๋ก : {'๋ถ„ํฌ ๋‹ค๋ฆ„' if p_value < 0.05 else '๋ถ„ํฌ ๊ฐ™์Œ'}")

    # Test of independence on a contingency table (rows: groups A/B/C).
    print("\n[๋…๋ฆฝ์„ฑ ๊ฒ€์ • (๊ต์ฐจํ‘œ)]")
    contingency_table = np.array([
        [30, 20, 10],  # group A
        [15, 25, 20],  # group B
        [25, 15, 25],  # group C
    ])

    print("๊ต์ฐจํ‘œ:")
    print(contingency_table)

    # chi2_contingency also returns degrees of freedom and the expected table.
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

    print(f"\nฯ‡ยฒ = {chi2:.4f}")
    print(f"์ž์œ ๋„ = {dof}")
    print(f"p-value = {p_value:.4f}")
    print(f"๊ฒฐ๋ก : {'๋…๋ฆฝ ์•„๋‹˜' if p_value < 0.05 else '๋…๋ฆฝ'}")

# =============================================================================
# 5. Analysis of Variance (ANOVA)
# =============================================================================
def anova_test():
    """Compare three simulated group means with one-way ANOVA and with the
    non-parametric Kruskal-Wallis test."""
    print("\n[5] ๋ถ„์‚ฐ ๋ถ„์„ (ANOVA)")
    print("=" * 50)

    np.random.seed(42)

    # Three groups with different true means (100, 105, 110).
    group1 = np.random.normal(100, 10, 30)
    group2 = np.random.normal(105, 10, 30)
    group3 = np.random.normal(110, 10, 30)

    print(f"๊ทธ๋ฃน1 ํ‰๊ท : {np.mean(group1):.2f}")
    print(f"๊ทธ๋ฃน2 ํ‰๊ท : {np.mean(group2):.2f}")
    print(f"๊ทธ๋ฃน3 ํ‰๊ท : {np.mean(group3):.2f}")

    # One-way ANOVA: H0 is that all group means are equal.
    f_stat, p_value = stats.f_oneway(group1, group2, group3)

    print(f"\n์ผ์› ๋ถ„์‚ฐ ๋ถ„์„ (One-way ANOVA)")
    print(f"H0: ฮผ1 = ฮผ2 = ฮผ3")
    print(f"F-ํ†ต๊ณ„๋Ÿ‰: {f_stat:.4f}")
    print(f"p-value: {p_value:.4f}")
    print(f"๊ฒฐ๋ก : {'๊ทธ๋ฃน ๊ฐ„ ์ฐจ์ด ์žˆ์Œ' if p_value < 0.05 else '๊ทธ๋ฃน ๊ฐ„ ์ฐจ์ด ์—†์Œ'}")

    # Kruskal-Wallis: rank-based alternative when normality is doubtful.
    print("\n[๋น„๋ชจ์ˆ˜ ๊ฒ€์ •: Kruskal-Wallis]")
    h_stat, p_value = stats.kruskal(group1, group2, group3)
    print(f"H-ํ†ต๊ณ„๋Ÿ‰: {h_stat:.4f}")
    print(f"p-value: {p_value:.4f}")

# =============================================================================
# 6. Normality Tests
# =============================================================================
def normality_test():
    """Apply Shapiro-Wilk and Kolmogorov-Smirnov normality tests to a normal
    sample and Shapiro-Wilk to an exponential (skewed) sample."""
    print("\n[6] ์ •๊ทœ์„ฑ ๊ฒ€์ •")
    print("=" * 50)

    np.random.seed(42)

    # Normally distributed data.
    normal_data = np.random.normal(0, 1, 100)

    # Non-normal data (exponential distribution).
    skewed_data = np.random.exponential(2, 100)

    print("[์ •๊ทœ ๋ถ„ํฌ ๋ฐ์ดํ„ฐ]")
    print(f"์™œ๋„: {stats.skew(normal_data):.4f}")
    print(f"์ฒจ๋„: {stats.kurtosis(normal_data):.4f}")

    # Shapiro-Wilk test (recommended for small-to-moderate samples).
    stat, p = stats.shapiro(normal_data)
    print(f"Shapiro-Wilk: W={stat:.4f}, p={p:.4f}")

    # Kolmogorov-Smirnov test against the standard normal distribution.
    stat, p = stats.kstest(normal_data, 'norm')
    print(f"K-S ๊ฒ€์ •: D={stat:.4f}, p={p:.4f}")

    # NOTE: this conclusion uses the K-S p-value (the most recent `p`).
    print(f"\n๊ฒฐ๋ก : {'์ •๊ทœ ๋ถ„ํฌ' if p > 0.05 else '์ •๊ทœ ๋ถ„ํฌ ์•„๋‹˜'}")

    print("\n[๋น„์ •๊ทœ ๋ถ„ํฌ ๋ฐ์ดํ„ฐ (์ง€์ˆ˜ ๋ถ„ํฌ)]")
    print(f"์™œ๋„: {stats.skew(skewed_data):.4f}")
    print(f"์ฒจ๋„: {stats.kurtosis(skewed_data):.4f}")

    stat, p = stats.shapiro(skewed_data)
    print(f"Shapiro-Wilk: W={stat:.4f}, p={p:.4f}")
    print(f"๊ฒฐ๋ก : {'์ •๊ทœ ๋ถ„ํฌ' if p > 0.05 else '์ •๊ทœ ๋ถ„ํฌ ์•„๋‹˜'}")

# =============================================================================
# 7. Confidence Intervals
# =============================================================================
def confidence_interval():
    """Compute t-based 95%/99% confidence intervals for a sample mean and a
    normal-approximation 95% interval for a proportion."""
    print("\n[7] ์‹ ๋ขฐ ๊ตฌ๊ฐ„")
    print("=" * 50)

    np.random.seed(42)
    sample = np.random.normal(100, 15, 50)

    mean = np.mean(sample)
    sem = stats.sem(sample)  # standard error of the mean

    # t-distribution intervals with n-1 degrees of freedom.
    ci_95 = stats.t.interval(0.95, len(sample) - 1, loc=mean, scale=sem)
    ci_99 = stats.t.interval(0.99, len(sample) - 1, loc=mean, scale=sem)

    print(f"ํ‘œ๋ณธ ํฌ๊ธฐ: {len(sample)}")
    print(f"ํ‘œ๋ณธ ํ‰๊ท : {mean:.2f}")
    print(f"ํ‘œ์ค€ ์˜ค์ฐจ: {sem:.2f}")
    print(f"\n95% ์‹ ๋ขฐ ๊ตฌ๊ฐ„: ({ci_95[0]:.2f}, {ci_95[1]:.2f})")
    print(f"99% ์‹ ๋ขฐ ๊ตฌ๊ฐ„: ({ci_99[0]:.2f}, {ci_99[1]:.2f})")

    # Wald (normal-approximation) confidence interval for a proportion.
    print("\n[๋น„์œจ์˜ ์‹ ๋ขฐ ๊ตฌ๊ฐ„]")
    n_success = 70
    n_total = 100
    p_hat = n_success / n_total
    se_p = np.sqrt(p_hat * (1 - p_hat) / n_total)
    z_95 = 1.96  # z critical value for 95% confidence

    ci_low = p_hat - z_95 * se_p
    ci_high = p_hat + z_95 * se_p

    print(f"์„ฑ๊ณต: {n_success}/{n_total} = {p_hat:.2f}")
    print(f"95% ์‹ ๋ขฐ ๊ตฌ๊ฐ„: ({ci_low:.4f}, {ci_high:.4f})")

# =============================================================================
# 8. Effect Size
# =============================================================================
def effect_size():
    """Compute Cohen's d for two simulated groups, interpret its magnitude,
    and show r / r-squared as an effect size for correlation."""
    print("\n[8] ํšจ๊ณผ ํฌ๊ธฐ")
    print("=" * 50)

    np.random.seed(42)

    group1 = np.random.normal(100, 15, 50)
    group2 = np.random.normal(110, 15, 50)

    def cohens_d(g1, g2):
        """Cohen's d: mean difference divided by the pooled standard deviation."""
        n1, n2 = len(g1), len(g2)
        var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        return (np.mean(g1) - np.mean(g2)) / pooled_std

    # Ordered (group2, group1) so a positive d means group2's mean is larger.
    d = cohens_d(group2, group1)

    print(f"๊ทธ๋ฃน1 ํ‰๊ท : {np.mean(group1):.2f}")
    print(f"๊ทธ๋ฃน2 ํ‰๊ท : {np.mean(group2):.2f}")
    print(f"\nCohen's d: {d:.4f}")

    # Conventional thresholds: 0.2 small, 0.5 medium, 0.8 large.
    if abs(d) < 0.2:
        interpretation = "ํšจ๊ณผ ์—†์Œ"
    elif abs(d) < 0.5:
        interpretation = "์ž‘์€ ํšจ๊ณผ"
    elif abs(d) < 0.8:
        interpretation = "์ค‘๊ฐ„ ํšจ๊ณผ"
    else:
        interpretation = "ํฐ ํšจ๊ณผ"

    print(f"ํ•ด์„: {interpretation}")

    # Correlation coefficient as an effect size; r^2 = explained variance.
    x = np.random.randn(100)
    y = 0.5 * x + np.random.randn(100) * 0.5
    r, _ = stats.pearsonr(x, y)
    r_squared = r ** 2

    print(f"\n์ƒ๊ด€๊ณ„์ˆ˜ r: {r:.4f}")
    print(f"๊ฒฐ์ •๊ณ„์ˆ˜ rยฒ: {r_squared:.4f}")
    print(f"(์„ค๋ช… ๋ถ„์‚ฐ ๋น„์œจ: {r_squared*100:.1f}%)")

# =============================================================================
# Main
# =============================================================================
def main():
    """Run every statistics demo in order, then print a summary cheat sheet."""
    print("=" * 60)
    print("ํ†ต๊ณ„ ๋ถ„์„ ์˜ˆ์ œ")
    print("=" * 60)

    descriptive_stats()
    correlation_analysis()
    hypothesis_testing()
    chi_square_test()
    anova_test()
    normality_test()
    confidence_interval()
    effect_size()

    print("\n" + "=" * 60)
    print("ํ†ต๊ณ„ ๋ถ„์„ ํ•ต์‹ฌ ์ •๋ฆฌ")
    print("=" * 60)
    print("""
    ๊ธฐ์ˆ  ํ†ต๊ณ„:
    - ์ค‘์‹ฌ: ํ‰๊ท , ์ค‘์•™๊ฐ’, ์ตœ๋นˆ๊ฐ’
    - ์‚ฐํฌ: ๋ถ„์‚ฐ, ํ‘œ์ค€ํŽธ์ฐจ, IQR
    - ํ˜•ํƒœ: ์™œ๋„, ์ฒจ๋„

    ์ถ”๋ก  ํ†ต๊ณ„:
    - t-๊ฒ€์ •: ํ‰๊ท  ๋น„๊ต (1ํ‘œ๋ณธ, ๋…๋ฆฝ, ๋Œ€์‘)
    - ANOVA: 3๊ฐœ ์ด์ƒ ๊ทธ๋ฃน ํ‰๊ท  ๋น„๊ต
    - ์นด์ด์ œ๊ณฑ: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๊ด€๊ณ„
    - ์ƒ๊ด€๋ถ„์„: ์—ฐ์†ํ˜• ๋ณ€์ˆ˜ ๊ด€๊ณ„

    ๊ฐ€์„ค ๊ฒ€์ • ์ ˆ์ฐจ:
    1. ๊ท€๋ฌด๊ฐ€์„ค(H0)๊ณผ ๋Œ€๋ฆฝ๊ฐ€์„ค(H1) ์„ค์ •
    2. ์œ ์˜์ˆ˜์ค€ ๊ฒฐ์ • (๋ณดํ†ต ฮฑ=0.05)
    3. ๊ฒ€์ • ํ†ต๊ณ„๋Ÿ‰ ๊ณ„์‚ฐ
    4. p-value์™€ ์œ ์˜์ˆ˜์ค€ ๋น„๊ต
    5. ๊ฒฐ๋ก  ๋„์ถœ

    p-value ํ•ด์„:
    - p < 0.05: ํ†ต๊ณ„์ ์œผ๋กœ ์œ ์˜ํ•จ
    - p โ‰ฅ 0.05: ํ†ต๊ณ„์ ์œผ๋กœ ์œ ์˜ํ•˜์ง€ ์•Š์Œ

    ์ฃผ์˜:
    - ํ†ต๊ณ„์  ์œ ์˜์„ฑ โ‰  ์‹ค์งˆ์  ์ค‘์š”์„ฑ
    - ํšจ๊ณผ ํฌ๊ธฐ๋„ ํ•จ๊ป˜ ๋ณด๊ณ 
    - ๋‹ค์ค‘ ๋น„๊ต ์‹œ ๋ณด์ • ํ•„์š” (Bonferroni ๋“ฑ)
    """)


# Script entry point: run all demos when executed directly.
if __name__ == "__main__":
    main()