1"""
210_nonparametric.py
3
4Demonstrates nonparametric statistical methods:
5- Mann-Whitney U test
6- Wilcoxon signed-rank test
7- Kruskal-Wallis test
8- Kolmogorov-Smirnov test
9- Kernel density estimation
10- Bootstrap confidence intervals
11"""
12
13import numpy as np
14from scipy import stats
15
# Matplotlib is optional: plotting sections are skipped when it is missing.
try:
    import matplotlib.pyplot as plt
    HAS_PLT = True  # flag consulted before every plt.* call in this module
except ImportError:
    HAS_PLT = False
    print("matplotlib not available; skipping plots\n")
22
23
def print_section(title):
    """Print a section banner: blank line, 70-char rule, title, rule."""
    bar = "=" * 70
    print("\n" + bar)
    print(f" {title}")
    print(bar)
29
30
def mann_whitney_test():
    """Demonstrate Mann-Whitney U test."""
    print_section("1. Mann-Whitney U Test")

    print("Comparing two independent samples (non-normal distributions)")

    # Exponential samples are right-skewed, so a rank-based test is apt.
    np.random.seed(42)
    group1 = np.random.exponential(2, 40)
    group2 = np.random.exponential(2.5, 35)

    # Summarize both samples with the same three statistics.
    for label, sample in (("Group 1", group1), ("Group 2", group2)):
        print(f"\n{label}: n={len(sample)}")
        print(f" Mean: {np.mean(sample):.2f}")
        print(f" Median: {np.median(sample):.2f}")
        print(f" Std: {np.std(sample, ddof=1):.2f}")

    # Rank-based two-sided comparison of the independent samples.
    u_stat, p_val = stats.mannwhitneyu(group1, group2, alternative='two-sided')

    print(f"\nMann-Whitney U test:")
    print(f" U statistic: {u_stat:.2f}")
    print(f" p-value: {p_val:.6f}")

    if p_val < 0.05:
        print(" Significant difference in distributions (p < 0.05)")
    else:
        print(" No significant difference (p ≥ 0.05)")

    # Parametric counterpart, shown purely for contrast.
    t_stat, t_p = stats.ttest_ind(group1, group2)

    print(f"\nFor comparison, t-test p-value: {t_p:.6f}")
    print(" (t-test assumes normality; Mann-Whitney does not)")

    if not HAS_PLT:
        return

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: overlaid density histograms with dashed median markers.
    hist_ax.hist(group1, bins=20, alpha=0.7, label='Group 1', density=True)
    hist_ax.hist(group2, bins=20, alpha=0.7, label='Group 2', density=True)
    hist_ax.axvline(np.median(group1), color='C0', linestyle='--', linewidth=2)
    hist_ax.axvline(np.median(group2), color='C1', linestyle='--', linewidth=2)
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Density')
    hist_ax.set_title('Distribution Comparison')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Right panel: side-by-side box plots of the raw samples.
    box_ax.boxplot([group1, group2], labels=['Group 1', 'Group 2'])
    box_ax.set_ylabel('Value')
    box_ax.set_title('Box Plot Comparison')
    box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('/tmp/mann_whitney.png', dpi=100)
    print("\n[Plot saved to /tmp/mann_whitney.png]")
    plt.close()
94
95
def wilcoxon_signed_rank():
    """Demonstrate Wilcoxon signed-rank test."""
    print_section("2. Wilcoxon Signed-Rank Test")

    print("Paired comparison (non-normal differences)")

    # Paired data: each "after" is "before" plus a skewed positive effect,
    # so the differences are non-normal by construction.
    np.random.seed(123)
    n = 30
    before = np.random.gamma(3, 2, n)
    treatment_effect = np.random.exponential(1.5, n)
    after = before + treatment_effect

    differences = after - before

    print(f"\nPaired data: n={n}")
    print("\nBefore:")
    print(f" Median: {np.median(before):.2f}")
    print("After:")
    print(f" Median: {np.median(after):.2f}")

    print("\nDifferences (after - before):")
    print(f" Median: {np.median(differences):.2f}")
    print(f" Mean: {np.mean(differences):.2f}")

    # One-sided signed-rank test for an increase from before to after.
    w_stat, w_p = stats.wilcoxon(after, before, alternative='greater')

    print("\nWilcoxon signed-rank test (H₁: after > before):")
    print(f" Statistic: {w_stat:.2f}")
    print(f" p-value: {w_p:.6f}")

    verdict = (" Significant increase detected (p < 0.05)" if w_p < 0.05
               else " No significant increase (p ≥ 0.05)")
    print(verdict)

    # Paired t-test for contrast; halving its two-sided p-value yields the
    # one-sided value since the observed effect is in the tested direction.
    t_stat, t_p = stats.ttest_rel(after, before)

    print(f"\nFor comparison, paired t-test p-value: {t_p/2:.6f}")
    print(" (one-sided)")

    # Normality check on the differences motivates the nonparametric choice.
    shapiro_stat, shapiro_p = stats.shapiro(differences)

    print("\nShapiro-Wilk test on differences:")
    print(f" p-value: {shapiro_p:.6f}")
    if shapiro_p < 0.05:
        print(" Differences not normally distributed (Wilcoxon appropriate)")
146
147
def kruskal_wallis_test():
    """Demonstrate Kruskal-Wallis test."""
    print_section("3. Kruskal-Wallis Test")

    print("Comparing multiple independent groups (nonparametric ANOVA)")

    # Three lognormal groups whose location parameter shifts upward.
    np.random.seed(456)
    groups = [
        np.random.lognormal(1, 0.5, 25),
        np.random.lognormal(1.2, 0.5, 30),
        np.random.lognormal(1.4, 0.5, 28),
    ]

    for idx, grp in enumerate(groups, 1):
        lead = "\n" if idx == 1 else ""
        print(f"{lead}Group {idx}: n={len(grp)}, median={np.median(grp):.2f}")

    # Rank-based omnibus test across all three groups.
    h_stat, p_val = stats.kruskal(*groups)

    print("\nKruskal-Wallis test:")
    print(f" H statistic: {h_stat:.4f}")
    print(f" p-value: {p_val:.6f}")

    if p_val < 0.05:
        print(" Significant differences among groups (p < 0.05)")
    else:
        print(" No significant differences (p ≥ 0.05)")

    # Classical one-way ANOVA shown for contrast.
    f_stat, anova_p = stats.f_oneway(*groups)

    print(f"\nFor comparison, one-way ANOVA p-value: {anova_p:.6f}")

    if HAS_PLT:
        plt.figure(figsize=(10, 6))
        plt.boxplot(groups, labels=['Group 1', 'Group 2', 'Group 3'])
        plt.ylabel('Value')
        plt.title(f'Kruskal-Wallis Test (H={h_stat:.2f}, p={p_val:.4f})')
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/kruskal_wallis.png', dpi=100)
        print("\n[Plot saved to /tmp/kruskal_wallis.png]")
        plt.close()
191
192
def kolmogorov_smirnov_test():
    """Demonstrate Kolmogorov-Smirnov test."""
    print_section("4. Kolmogorov-Smirnov Test")

    print("Testing goodness-of-fit to a distribution")

    np.random.seed(789)
    n = 100

    # One genuinely normal sample, and one bimodal mixture of equal size.
    sample_normal = np.random.normal(5, 2, n)
    sample_mixture = np.concatenate([
        np.random.normal(3, 1, n//2),
        np.random.normal(7, 1, n//2)
    ])

    print(f"\nSample size: {n}")

    def ks_vs_fitted_normal(sample):
        # KS test against a normal with mean/std estimated from the sample.
        # NOTE(review): estimating the parameters from the same data inflates
        # the p-value (a Lilliefors correction would be stricter) — acceptable
        # for this demonstration.
        return stats.kstest(sample, 'norm',
                            args=(np.mean(sample), np.std(sample, ddof=1)))

    print(f"\n1. Sample from N(5, 2²) tested against N(5, 2²):")
    stat1, p1 = ks_vs_fitted_normal(sample_normal)
    print(f" KS statistic: {stat1:.4f}")
    print(f" p-value: {p1:.6f}")
    if p1 > 0.05:
        print(" Consistent with normal distribution")

    print(f"\n2. Sample from mixture tested against normal:")
    stat2, p2 = ks_vs_fitted_normal(sample_mixture)
    print(f" KS statistic: {stat2:.4f}")
    print(f" p-value: {p2:.6f}")
    if p2 < 0.05:
        print(" Not consistent with normal distribution (p < 0.05)")

    print(f"\n3. Two-sample KS test (comparing two samples):")

    sample_a = np.random.normal(5, 2, 80)
    sample_b = np.random.normal(5.5, 2, 80)

    stat_2s, p_2s = stats.ks_2samp(sample_a, sample_b)

    print(f" KS statistic: {stat_2s:.4f}")
    print(f" p-value: {p_2s:.6f}")

    if HAS_PLT:
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # One row per sample: histogram + fitted PDF on the left,
        # empirical CDF vs fitted CDF on the right.
        rows = ((sample_normal, p1, 'Normal'), (sample_mixture, p2, 'Mixture'))
        for row, (sample, pval, name) in enumerate(rows):
            mu = np.mean(sample)
            sigma = np.std(sample, ddof=1)
            grid = np.linspace(sample.min(), sample.max(), 100)

            hist_ax = axes[row, 0]
            hist_ax.hist(sample, bins=20, density=True, alpha=0.7, edgecolor='black')
            hist_ax.plot(grid, stats.norm.pdf(grid, mu, sigma),
                         'r-', linewidth=2, label='Fitted normal')
            hist_ax.set_xlabel('Value')
            hist_ax.set_ylabel('Density')
            hist_ax.set_title(f'{name} Sample (KS p={pval:.3f})')
            hist_ax.legend()
            hist_ax.grid(True, alpha=0.3)

            cdf_ax = axes[row, 1]
            ordered = np.sort(sample)
            ecdf = np.arange(1, ordered.size + 1) / ordered.size
            cdf_ax.plot(ordered, ecdf, 'b-', linewidth=2, label='ECDF')
            cdf_ax.plot(grid, stats.norm.cdf(grid, mu, sigma),
                        'r--', linewidth=2, label='Theoretical CDF')
            cdf_ax.set_xlabel('Value')
            cdf_ax.set_ylabel('Cumulative Probability')
            cdf_ax.set_title(f'{name} Sample - CDF Comparison')
            cdf_ax.legend()
            cdf_ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/ks_test.png', dpi=100)
        print("\n[Plot saved to /tmp/ks_test.png]")
        plt.close()
299
300
def kernel_density_estimation():
    """Demonstrate kernel density estimation.

    Fits Gaussian KDEs with several bandwidth factors to bimodal data,
    compares each against the histogram, and reports Scott's-rule bandwidth.
    Computation and printouts run even when matplotlib is unavailable.
    """
    print_section("5. Kernel Density Estimation")

    # Generate bimodal data (two Gaussian modes) so bandwidth choice matters.
    np.random.seed(999)
    n1, n2 = 100, 80
    mode1 = np.random.normal(2, 0.8, n1)
    mode2 = np.random.normal(6, 1.2, n2)
    data = np.concatenate([mode1, mode2])

    print(f"Generated bimodal data: {len(data)} points")
    print(f"Mode 1: n={n1}, mean={np.mean(mode1):.2f}")
    print(f"Mode 2: n={n2}, mean={np.mean(mode2):.2f}")

    # NOTE: a scalar bw_method is a multiplier applied to the sample std
    # (gaussian_kde's "factor"), not an absolute kernel width.
    bandwidths = [0.3, 0.5, 1.0, 2.0]

    print(f"\nKernel density estimation with different bandwidths:")

    x_grid = np.linspace(data.min() - 2, data.max() + 2, 500)

    if HAS_PLT:
        plt.figure(figsize=(12, 8))

    for i, bw in enumerate(bandwidths, 1):
        # Gaussian KDE evaluated on a fixed grid spanning the data.
        kde = stats.gaussian_kde(data, bw_method=bw)
        density = kde(x_grid)

        print(f"\n Bandwidth = {bw}:")
        print(f" Peak density: {density.max():.4f}")

        # BUG FIX: plotting must be guarded — previously these plt.* calls
        # ran regardless of HAS_PLT and raised NameError when matplotlib
        # was not installed (plt is only bound inside the import guard).
        if HAS_PLT:
            plt.subplot(2, 2, i)
            plt.hist(data, bins=30, density=True, alpha=0.5, edgecolor='black')
            plt.plot(x_grid, density, 'r-', linewidth=2, label=f'KDE (bw={bw})')
            plt.xlabel('Value')
            plt.ylabel('Density')
            plt.title(f'KDE with Bandwidth = {bw}')
            plt.legend()
            plt.grid(True, alpha=0.3)

    if HAS_PLT:
        plt.tight_layout()
        plt.savefig('/tmp/kde.png', dpi=100)
        print("\n[Plot saved to /tmp/kde.png]")
        plt.close()

    # Scott's rule: kde.factor times the sample std is the effective
    # kernel width gaussian_kde actually uses.
    kde_scott = stats.gaussian_kde(data, bw_method='scott')
    optimal_bw = kde_scott.factor * data.std(ddof=1)

    print(f"\nScott's rule optimal bandwidth: {optimal_bw:.4f}")
353
354
def bootstrap_confidence_intervals():
    """Demonstrate bootstrap confidence intervals."""
    print_section("6. Bootstrap Confidence Intervals")

    # Gamma data are skewed, so percentile bootstrap CIs are informative.
    np.random.seed(111)
    data = np.random.gamma(2, 2, 80)

    print(f"Sample size: {len(data)}")
    print(f"Sample mean: {np.mean(data):.2f}")
    print(f"Sample median: {np.median(data):.2f}")
    print(f"Sample std: {np.std(data, ddof=1):.2f}")

    n_bootstrap = 5000
    alpha = 0.05  # nominal error rate; the [2.5, 97.5] percentiles below match it

    print(f"\nBootstrap resampling: {n_bootstrap} iterations")

    # Exactly one np.random.choice call per iteration keeps the RNG stream
    # identical to an explicit accumulation loop.
    resamples = [np.random.choice(data, len(data), replace=True)
                 for _ in range(n_bootstrap)]

    boot_means = np.array([np.mean(s) for s in resamples])
    boot_medians = np.array([np.median(s) for s in resamples])
    boot_stds = np.array([np.std(s, ddof=1) for s in resamples])
    # 10% trimmed mean is a robust location estimate.
    boot_trimmed_means = np.array([stats.trim_mean(s, 0.1) for s in resamples])

    print(f"\nBootstrap confidence intervals (95%):")

    # Percentile method: central 95% of each bootstrap distribution.
    ci_mean = np.percentile(boot_means, [2.5, 97.5])
    ci_median = np.percentile(boot_medians, [2.5, 97.5])
    ci_std = np.percentile(boot_stds, [2.5, 97.5])
    ci_trimmed = np.percentile(boot_trimmed_means, [2.5, 97.5])

    print(f"\n Mean:")
    print(f" Point estimate: {np.mean(data):.2f}")
    print(f" Bootstrap CI: [{ci_mean[0]:.2f}, {ci_mean[1]:.2f}]")
    print(f" Bootstrap SE: {np.std(boot_means, ddof=1):.2f}")

    print(f"\n Median:")
    print(f" Point estimate: {np.median(data):.2f}")
    print(f" Bootstrap CI: [{ci_median[0]:.2f}, {ci_median[1]:.2f}]")

    print(f"\n Std Dev:")
    print(f" Point estimate: {np.std(data, ddof=1):.2f}")
    print(f" Bootstrap CI: [{ci_std[0]:.2f}, {ci_std[1]:.2f}]")

    print(f"\n Trimmed Mean (10%):")
    print(f" Point estimate: {stats.trim_mean(data, 0.1):.2f}")
    print(f" Bootstrap CI: [{ci_trimmed[0]:.2f}, {ci_trimmed[1]:.2f}]")

    if HAS_PLT:
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # (position, draws, point estimate, CI, hist color, CI-line color,
        #  point label, x-label, title) — one tuple per panel.
        panels = (
            ((0, 0), boot_means, np.mean(data), ci_mean,
             None, 'green', 'Sample mean', 'Mean',
             'Bootstrap Distribution of Mean'),
            ((0, 1), boot_medians, np.median(data), ci_median,
             'orange', 'green', 'Sample median', 'Median',
             'Bootstrap Distribution of Median'),
            ((1, 0), boot_stds, np.std(data, ddof=1), ci_std,
             'green', 'blue', 'Sample std', 'Standard Deviation',
             'Bootstrap Distribution of Std Dev'),
            ((1, 1), boot_trimmed_means, stats.trim_mean(data, 0.1), ci_trimmed,
             'purple', 'green', 'Sample', 'Trimmed Mean (10%)',
             'Bootstrap Distribution of Trimmed Mean'),
        )

        for pos, draws, point, ci, bar_color, ci_color, pt_label, xlabel, title in panels:
            ax = axes[pos]
            if bar_color is None:
                # First panel keeps matplotlib's default histogram color.
                ax.hist(draws, bins=50, edgecolor='black', alpha=0.7)
            else:
                ax.hist(draws, bins=50, edgecolor='black', alpha=0.7, color=bar_color)
            ax.axvline(point, color='red', linestyle='--', linewidth=2, label=pt_label)
            ax.axvline(ci[0], color=ci_color, linestyle='--', linewidth=1.5)
            ax.axvline(ci[1], color=ci_color, linestyle='--', linewidth=1.5, label='95% CI')
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Frequency')
            ax.set_title(title)
            ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/bootstrap_ci.png', dpi=100)
        print("\n[Plot saved to /tmp/bootstrap_ci.png]")
        plt.close()
469
470
def main():
    """Run all demonstrations in order."""
    banner = "=" * 70
    print(banner)
    print(" NONPARAMETRIC METHODS DEMONSTRATIONS")
    print(banner)

    np.random.seed(42)

    # Each demo reseeds internally, so the order here is purely expository.
    demos = (
        mann_whitney_test,
        wilcoxon_signed_rank,
        kruskal_wallis_test,
        kolmogorov_smirnov_test,
        kernel_density_estimation,
        bootstrap_confidence_intervals,
    )
    for demo in demos:
        demo()

    print("\n" + banner)
    print(" All demonstrations completed successfully!")
    print(banner)


if __name__ == "__main__":
    main()