"""
Basic Statistical Analysis with Python

Covers fundamental statistical methods: descriptive statistics,
correlation, hypothesis testing (t-tests), chi-square tests, ANOVA,
normality tests, confidence intervals, and effect sizes.
"""
7
8import numpy as np
9import pandas as pd
10from scipy import stats
11
12
13# =============================================================================
14# 1. ๊ธฐ์ ํต๊ณ (Descriptive Statistics)
15# =============================================================================
16def descriptive_stats():
17 """๊ธฐ์ ํต๊ณ๋"""
18 print("\n[1] ๊ธฐ์ ํต๊ณ๋")
19 print("=" * 50)
20
21 np.random.seed(42)
22 data = np.random.normal(100, 15, 1000) # ํ๊ท 100, ํ์คํธ์ฐจ 15
23
24 print(f"๋ฐ์ดํฐ ํฌ๊ธฐ: {len(data)}")
25 print(f"\n์ค์ฌ ๊ฒฝํฅ:")
26 print(f" ํ๊ท (Mean): {np.mean(data):.2f}")
27 print(f" ์ค์๊ฐ (Median): {np.median(data):.2f}")
28 print(f" ์ต๋น๊ฐ (Mode): {stats.mode(data.round(), keepdims=False).mode:.2f}")
29
30 print(f"\n์ฐํฌ๋:")
31 print(f" ๋ถ์ฐ (Variance): {np.var(data, ddof=1):.2f}")
32 print(f" ํ์คํธ์ฐจ (Std): {np.std(data, ddof=1):.2f}")
33 print(f" ๋ฒ์ (Range): {np.ptp(data):.2f}")
34 print(f" IQR: {stats.iqr(data):.2f}")
35
36 print(f"\n๋ถ์์:")
37 percentiles = [25, 50, 75, 90, 95, 99]
38 for p in percentiles:
39 print(f" {p}th percentile: {np.percentile(data, p):.2f}")
40
41 print(f"\nํํ:")
42 print(f" ์๋ (Skewness): {stats.skew(data):.4f}")
43 print(f" ์ฒจ๋ (Kurtosis): {stats.kurtosis(data):.4f}")
44
45
46# =============================================================================
47# 2. ์๊ด ๋ถ์ (Correlation Analysis)
48# =============================================================================
49def correlation_analysis():
50 """์๊ด ๋ถ์"""
51 print("\n[2] ์๊ด ๋ถ์")
52 print("=" * 50)
53
54 np.random.seed(42)
55 n = 100
56
57 # ์๊ด๋ ๋ฐ์ดํฐ ์์ฑ
58 x = np.random.randn(n)
59 y = 2 * x + np.random.randn(n) * 0.5 # ๊ฐํ ์์ ์๊ด
60 z = -0.5 * x + np.random.randn(n) # ์ฝํ ์์ ์๊ด
61 w = np.random.randn(n) # ๋ฌด์๊ด
62
63 # ํผ์ด์จ ์๊ด๊ณ์
64 print("ํผ์ด์จ ์๊ด๊ณ์ (Pearson):")
65 corr_xy, p_xy = stats.pearsonr(x, y)
66 corr_xz, p_xz = stats.pearsonr(x, z)
67 corr_xw, p_xw = stats.pearsonr(x, w)
68
69 print(f" x-y: r = {corr_xy:.4f}, p = {p_xy:.4e}")
70 print(f" x-z: r = {corr_xz:.4f}, p = {p_xz:.4e}")
71 print(f" x-w: r = {corr_xw:.4f}, p = {p_xw:.4e}")
72
73 # ์คํผ์ด๋ง ์์ ์๊ด๊ณ์
74 print("\n์คํผ์ด๋ง ์์ ์๊ด๊ณ์ (Spearman):")
75 corr_s, p_s = stats.spearmanr(x, y)
76 print(f" x-y: ฯ = {corr_s:.4f}, p = {p_s:.4e}")
77
78 # DataFrame ์๊ด ํ๋ ฌ
79 df = pd.DataFrame({'x': x, 'y': y, 'z': z, 'w': w})
80 print("\n์๊ด ํ๋ ฌ:")
81 print(df.corr().round(4))
82
83
84# =============================================================================
85# 3. ๊ฐ์ค ๊ฒ์ ๊ธฐ์ด
86# =============================================================================
87def hypothesis_testing():
88 """๊ฐ์ค ๊ฒ์ """
89 print("\n[3] ๊ฐ์ค ๊ฒ์ ๊ธฐ์ด")
90 print("=" * 50)
91
92 np.random.seed(42)
93
94 # ๋จ์ผ ํ๋ณธ t-๊ฒ์
95 print("\n[๋จ์ผ ํ๋ณธ t-๊ฒ์ ]")
96 sample = np.random.normal(105, 15, 50) # ์ค์ ํ๊ท 105
97 t_stat, p_value = stats.ttest_1samp(sample, 100) # H0: ฮผ = 100
98
99 print(f"ํ๋ณธ ํ๊ท : {np.mean(sample):.2f}")
100 print(f"H0: ฮผ = 100")
101 print(f"t-ํต๊ณ๋: {t_stat:.4f}")
102 print(f"p-value: {p_value:.4f}")
103 print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑํ'} (ฮฑ=0.05)")
104
105 # ๋
๋ฆฝ ํ๋ณธ t-๊ฒ์
106 print("\n[๋
๋ฆฝ ํ๋ณธ t-๊ฒ์ ]")
107 group1 = np.random.normal(100, 10, 50)
108 group2 = np.random.normal(105, 10, 50)
109
110 t_stat, p_value = stats.ttest_ind(group1, group2)
111
112 print(f"๊ทธ๋ฃน1 ํ๊ท : {np.mean(group1):.2f}")
113 print(f"๊ทธ๋ฃน2 ํ๊ท : {np.mean(group2):.2f}")
114 print(f"H0: ฮผ1 = ฮผ2")
115 print(f"t-ํต๊ณ๋: {t_stat:.4f}")
116 print(f"p-value: {p_value:.4f}")
117 print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑํ'} (ฮฑ=0.05)")
118
119 # ๋์ ํ๋ณธ t-๊ฒ์
120 print("\n[๋์ ํ๋ณธ t-๊ฒ์ ]")
121 before = np.random.normal(100, 10, 30)
122 after = before + np.random.normal(5, 3, 30) # ํ๊ท 5 ์ฆ๊ฐ
123
124 t_stat, p_value = stats.ttest_rel(before, after)
125
126 print(f"์ฌ์ ํ๊ท : {np.mean(before):.2f}")
127 print(f"์ฌํ ํ๊ท : {np.mean(after):.2f}")
128 print(f"H0: ฮผ_์ฐจ์ด = 0")
129 print(f"t-ํต๊ณ๋: {t_stat:.4f}")
130 print(f"p-value: {p_value:.4f}")
131 print(f"๊ฒฐ๋ก : {'H0 ๊ธฐ๊ฐ' if p_value < 0.05 else 'H0 ์ฑํ'} (ฮฑ=0.05)")
132
133
134# =============================================================================
135# 4. ์นด์ด์ ๊ณฑ ๊ฒ์
136# =============================================================================
137def chi_square_test():
138 """์นด์ด์ ๊ณฑ ๊ฒ์ """
139 print("\n[4] ์นด์ด์ ๊ณฑ ๊ฒ์ ")
140 print("=" * 50)
141
142 # ์ ํฉ๋ ๊ฒ์
143 print("\n[์ ํฉ๋ ๊ฒ์ ]")
144 observed = np.array([18, 22, 20, 15, 25]) # ๊ด์ธก ๋น๋
145 expected = np.array([20, 20, 20, 20, 20]) # ๊ธฐ๋ ๋น๋
146
147 chi2, p_value = stats.chisquare(observed, expected)
148
149 print(f"๊ด์ธก๊ฐ: {observed}")
150 print(f"๊ธฐ๋๊ฐ: {expected}")
151 print(f"ฯยฒ = {chi2:.4f}")
152 print(f"p-value = {p_value:.4f}")
153 print(f"๊ฒฐ๋ก : {'๋ถํฌ ๋ค๋ฆ' if p_value < 0.05 else '๋ถํฌ ๊ฐ์'}")
154
155 # ๋
๋ฆฝ์ฑ ๊ฒ์
156 print("\n[๋
๋ฆฝ์ฑ ๊ฒ์ (๊ต์ฐจํ)]")
157 contingency_table = np.array([
158 [30, 20, 10], # ๊ทธ๋ฃน A
159 [15, 25, 20], # ๊ทธ๋ฃน B
160 [25, 15, 25] # ๊ทธ๋ฃน C
161 ])
162
163 print("๊ต์ฐจํ:")
164 print(contingency_table)
165
166 chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
167
168 print(f"\nฯยฒ = {chi2:.4f}")
169 print(f"์์ ๋ = {dof}")
170 print(f"p-value = {p_value:.4f}")
171 print(f"๊ฒฐ๋ก : {'๋
๋ฆฝ ์๋' if p_value < 0.05 else '๋
๋ฆฝ'}")
172
173
174# =============================================================================
175# 5. ๋ถ์ฐ ๋ถ์ (ANOVA)
176# =============================================================================
177def anova_test():
178 """ANOVA"""
179 print("\n[5] ๋ถ์ฐ ๋ถ์ (ANOVA)")
180 print("=" * 50)
181
182 np.random.seed(42)
183
184 # ์ธ ๊ทธ๋ฃน ๋ฐ์ดํฐ
185 group1 = np.random.normal(100, 10, 30)
186 group2 = np.random.normal(105, 10, 30)
187 group3 = np.random.normal(110, 10, 30)
188
189 print(f"๊ทธ๋ฃน1 ํ๊ท : {np.mean(group1):.2f}")
190 print(f"๊ทธ๋ฃน2 ํ๊ท : {np.mean(group2):.2f}")
191 print(f"๊ทธ๋ฃน3 ํ๊ท : {np.mean(group3):.2f}")
192
193 # ์ผ์ ๋ถ์ฐ ๋ถ์
194 f_stat, p_value = stats.f_oneway(group1, group2, group3)
195
196 print(f"\n์ผ์ ๋ถ์ฐ ๋ถ์ (One-way ANOVA)")
197 print(f"H0: ฮผ1 = ฮผ2 = ฮผ3")
198 print(f"F-ํต๊ณ๋: {f_stat:.4f}")
199 print(f"p-value: {p_value:.4f}")
200 print(f"๊ฒฐ๋ก : {'๊ทธ๋ฃน ๊ฐ ์ฐจ์ด ์์' if p_value < 0.05 else '๊ทธ๋ฃน ๊ฐ ์ฐจ์ด ์์'}")
201
202 # Kruskal-Wallis (๋น๋ชจ์)
203 print("\n[๋น๋ชจ์ ๊ฒ์ : Kruskal-Wallis]")
204 h_stat, p_value = stats.kruskal(group1, group2, group3)
205 print(f"H-ํต๊ณ๋: {h_stat:.4f}")
206 print(f"p-value: {p_value:.4f}")
207
208
209# =============================================================================
210# 6. ์ ๊ท์ฑ ๊ฒ์
211# =============================================================================
212def normality_test():
213 """์ ๊ท์ฑ ๊ฒ์ """
214 print("\n[6] ์ ๊ท์ฑ ๊ฒ์ ")
215 print("=" * 50)
216
217 np.random.seed(42)
218
219 # ์ ๊ท ๋ถํฌ ๋ฐ์ดํฐ
220 normal_data = np.random.normal(0, 1, 100)
221
222 # ๋น์ ๊ท ๋ถํฌ ๋ฐ์ดํฐ (์ง์ ๋ถํฌ)
223 skewed_data = np.random.exponential(2, 100)
224
225 print("[์ ๊ท ๋ถํฌ ๋ฐ์ดํฐ]")
226 print(f"์๋: {stats.skew(normal_data):.4f}")
227 print(f"์ฒจ๋: {stats.kurtosis(normal_data):.4f}")
228
229 # Shapiro-Wilk ๊ฒ์
230 stat, p = stats.shapiro(normal_data)
231 print(f"Shapiro-Wilk: W={stat:.4f}, p={p:.4f}")
232
233 # Kolmogorov-Smirnov ๊ฒ์
234 stat, p = stats.kstest(normal_data, 'norm')
235 print(f"K-S ๊ฒ์ : D={stat:.4f}, p={p:.4f}")
236
237 print(f"\n๊ฒฐ๋ก : {'์ ๊ท ๋ถํฌ' if p > 0.05 else '์ ๊ท ๋ถํฌ ์๋'}")
238
239 print("\n[๋น์ ๊ท ๋ถํฌ ๋ฐ์ดํฐ (์ง์ ๋ถํฌ)]")
240 print(f"์๋: {stats.skew(skewed_data):.4f}")
241 print(f"์ฒจ๋: {stats.kurtosis(skewed_data):.4f}")
242
243 stat, p = stats.shapiro(skewed_data)
244 print(f"Shapiro-Wilk: W={stat:.4f}, p={p:.4f}")
245 print(f"๊ฒฐ๋ก : {'์ ๊ท ๋ถํฌ' if p > 0.05 else '์ ๊ท ๋ถํฌ ์๋'}")
246
247
248# =============================================================================
249# 7. ์ ๋ขฐ ๊ตฌ๊ฐ
250# =============================================================================
251def confidence_interval():
252 """์ ๋ขฐ ๊ตฌ๊ฐ"""
253 print("\n[7] ์ ๋ขฐ ๊ตฌ๊ฐ")
254 print("=" * 50)
255
256 np.random.seed(42)
257 sample = np.random.normal(100, 15, 50)
258
259 mean = np.mean(sample)
260 sem = stats.sem(sample) # ํ์ค ์ค์ฐจ
261
262 # 95% ์ ๋ขฐ ๊ตฌ๊ฐ
263 ci_95 = stats.t.interval(0.95, len(sample)-1, loc=mean, scale=sem)
264 ci_99 = stats.t.interval(0.99, len(sample)-1, loc=mean, scale=sem)
265
266 print(f"ํ๋ณธ ํฌ๊ธฐ: {len(sample)}")
267 print(f"ํ๋ณธ ํ๊ท : {mean:.2f}")
268 print(f"ํ์ค ์ค์ฐจ: {sem:.2f}")
269 print(f"\n95% ์ ๋ขฐ ๊ตฌ๊ฐ: ({ci_95[0]:.2f}, {ci_95[1]:.2f})")
270 print(f"99% ์ ๋ขฐ ๊ตฌ๊ฐ: ({ci_99[0]:.2f}, {ci_99[1]:.2f})")
271
272 # ๋น์จ์ ์ ๋ขฐ ๊ตฌ๊ฐ
273 print("\n[๋น์จ์ ์ ๋ขฐ ๊ตฌ๊ฐ]")
274 n_success = 70
275 n_total = 100
276 p_hat = n_success / n_total
277 se_p = np.sqrt(p_hat * (1 - p_hat) / n_total)
278 z_95 = 1.96
279
280 ci_low = p_hat - z_95 * se_p
281 ci_high = p_hat + z_95 * se_p
282
283 print(f"์ฑ๊ณต: {n_success}/{n_total} = {p_hat:.2f}")
284 print(f"95% ์ ๋ขฐ ๊ตฌ๊ฐ: ({ci_low:.4f}, {ci_high:.4f})")
285
286
287# =============================================================================
288# 8. ํจ๊ณผ ํฌ๊ธฐ
289# =============================================================================
290def effect_size():
291 """ํจ๊ณผ ํฌ๊ธฐ ๊ณ์ฐ"""
292 print("\n[8] ํจ๊ณผ ํฌ๊ธฐ")
293 print("=" * 50)
294
295 np.random.seed(42)
296
297 group1 = np.random.normal(100, 15, 50)
298 group2 = np.random.normal(110, 15, 50)
299
300 # Cohen's d
301 def cohens_d(g1, g2):
302 n1, n2 = len(g1), len(g2)
303 var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
304 pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
305 return (np.mean(g1) - np.mean(g2)) / pooled_std
306
307 d = cohens_d(group2, group1)
308
309 print(f"๊ทธ๋ฃน1 ํ๊ท : {np.mean(group1):.2f}")
310 print(f"๊ทธ๋ฃน2 ํ๊ท : {np.mean(group2):.2f}")
311 print(f"\nCohen's d: {d:.4f}")
312
313 # ํจ๊ณผ ํฌ๊ธฐ ํด์
314 if abs(d) < 0.2:
315 interpretation = "ํจ๊ณผ ์์"
316 elif abs(d) < 0.5:
317 interpretation = "์์ ํจ๊ณผ"
318 elif abs(d) < 0.8:
319 interpretation = "์ค๊ฐ ํจ๊ณผ"
320 else:
321 interpretation = "ํฐ ํจ๊ณผ"
322
323 print(f"ํด์: {interpretation}")
324
325 # ์๊ด๊ณ์๋ฅผ ํจ๊ณผ ํฌ๊ธฐ๋ก
326 x = np.random.randn(100)
327 y = 0.5 * x + np.random.randn(100) * 0.5
328 r, _ = stats.pearsonr(x, y)
329 r_squared = r ** 2
330
331 print(f"\n์๊ด๊ณ์ r: {r:.4f}")
332 print(f"๊ฒฐ์ ๊ณ์ rยฒ: {r_squared:.4f}")
333 print(f"(์ค๋ช
๋ถ์ฐ ๋น์จ: {r_squared*100:.1f}%)")
334
335
336# =============================================================================
337# ๋ฉ์ธ
338# =============================================================================
339def main():
340 print("=" * 60)
341 print("ํต๊ณ ๋ถ์ ์์ ")
342 print("=" * 60)
343
344 descriptive_stats()
345 correlation_analysis()
346 hypothesis_testing()
347 chi_square_test()
348 anova_test()
349 normality_test()
350 confidence_interval()
351 effect_size()
352
353 print("\n" + "=" * 60)
354 print("ํต๊ณ ๋ถ์ ํต์ฌ ์ ๋ฆฌ")
355 print("=" * 60)
356 print("""
357 ๊ธฐ์ ํต๊ณ:
358 - ์ค์ฌ: ํ๊ท , ์ค์๊ฐ, ์ต๋น๊ฐ
359 - ์ฐํฌ: ๋ถ์ฐ, ํ์คํธ์ฐจ, IQR
360 - ํํ: ์๋, ์ฒจ๋
361
362 ์ถ๋ก ํต๊ณ:
363 - t-๊ฒ์ : ํ๊ท ๋น๊ต (1ํ๋ณธ, ๋
๋ฆฝ, ๋์)
364 - ANOVA: 3๊ฐ ์ด์ ๊ทธ๋ฃน ํ๊ท ๋น๊ต
365 - ์นด์ด์ ๊ณฑ: ๋ฒ์ฃผํ ๋ณ์ ๊ด๊ณ
366 - ์๊ด๋ถ์: ์ฐ์ํ ๋ณ์ ๊ด๊ณ
367
368 ๊ฐ์ค ๊ฒ์ ์ ์ฐจ:
369 1. ๊ท๋ฌด๊ฐ์ค(H0)๊ณผ ๋๋ฆฝ๊ฐ์ค(H1) ์ค์
370 2. ์ ์์์ค ๊ฒฐ์ (๋ณดํต ฮฑ=0.05)
371 3. ๊ฒ์ ํต๊ณ๋ ๊ณ์ฐ
372 4. p-value์ ์ ์์์ค ๋น๊ต
373 5. ๊ฒฐ๋ก ๋์ถ
374
375 p-value ํด์:
376 - p < 0.05: ํต๊ณ์ ์ผ๋ก ์ ์ํจ
377 - p โฅ 0.05: ํต๊ณ์ ์ผ๋ก ์ ์ํ์ง ์์
378
379 ์ฃผ์:
380 - ํต๊ณ์ ์ ์์ฑ โ ์ค์ง์ ์ค์์ฑ
381 - ํจ๊ณผ ํฌ๊ธฐ๋ ํจ๊ป ๋ณด๊ณ
382 - ๋ค์ค ๋น๊ต ์ ๋ณด์ ํ์ (Bonferroni ๋ฑ)
383 """)
384
385
386if __name__ == "__main__":
387 main()