# 10_nonparametric.py
  1"""
  210_nonparametric.py
  3
  4Demonstrates nonparametric statistical methods:
  5- Mann-Whitney U test
  6- Wilcoxon signed-rank test
  7- Kruskal-Wallis test
  8- Kolmogorov-Smirnov test
  9- Kernel density estimation
 10- Bootstrap confidence intervals
 11"""
 12
 13import numpy as np
 14from scipy import stats
 15
 16try:
 17    import matplotlib.pyplot as plt
 18    HAS_PLT = True
 19except ImportError:
 20    HAS_PLT = False
 21    print("matplotlib not available; skipping plots\n")
 22
 23
 24def print_section(title):
 25    """Print formatted section header."""
 26    print("\n" + "=" * 70)
 27    print(f"  {title}")
 28    print("=" * 70)
 29
 30
 31def mann_whitney_test():
 32    """Demonstrate Mann-Whitney U test."""
 33    print_section("1. Mann-Whitney U Test")
 34
 35    print("Comparing two independent samples (non-normal distributions)")
 36
 37    # Generate skewed data
 38    np.random.seed(42)
 39    group1 = np.random.exponential(2, 40)
 40    group2 = np.random.exponential(2.5, 35)
 41
 42    print(f"\nGroup 1: n={len(group1)}")
 43    print(f"  Mean: {np.mean(group1):.2f}")
 44    print(f"  Median: {np.median(group1):.2f}")
 45    print(f"  Std: {np.std(group1, ddof=1):.2f}")
 46
 47    print(f"\nGroup 2: n={len(group2)}")
 48    print(f"  Mean: {np.mean(group2):.2f}")
 49    print(f"  Median: {np.median(group2):.2f}")
 50    print(f"  Std: {np.std(group2, ddof=1):.2f}")
 51
 52    # Mann-Whitney U test
 53    statistic, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
 54
 55    print(f"\nMann-Whitney U test:")
 56    print(f"  U statistic: {statistic:.2f}")
 57    print(f"  p-value: {p_value:.6f}")
 58
 59    if p_value < 0.05:
 60        print(f"  Significant difference in distributions (p < 0.05)")
 61    else:
 62        print(f"  No significant difference (p ≥ 0.05)")
 63
 64    # Compare with t-test (for comparison)
 65    t_stat, t_p = stats.ttest_ind(group1, group2)
 66
 67    print(f"\nFor comparison, t-test p-value: {t_p:.6f}")
 68    print(f"  (t-test assumes normality; Mann-Whitney does not)")
 69
 70    if HAS_PLT:
 71        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
 72
 73        # Histograms
 74        axes[0].hist(group1, bins=20, alpha=0.7, label='Group 1', density=True)
 75        axes[0].hist(group2, bins=20, alpha=0.7, label='Group 2', density=True)
 76        axes[0].axvline(np.median(group1), color='C0', linestyle='--', linewidth=2)
 77        axes[0].axvline(np.median(group2), color='C1', linestyle='--', linewidth=2)
 78        axes[0].set_xlabel('Value')
 79        axes[0].set_ylabel('Density')
 80        axes[0].set_title('Distribution Comparison')
 81        axes[0].legend()
 82        axes[0].grid(True, alpha=0.3)
 83
 84        # Box plots
 85        axes[1].boxplot([group1, group2], labels=['Group 1', 'Group 2'])
 86        axes[1].set_ylabel('Value')
 87        axes[1].set_title('Box Plot Comparison')
 88        axes[1].grid(True, alpha=0.3)
 89
 90        plt.tight_layout()
 91        plt.savefig('/tmp/mann_whitney.png', dpi=100)
 92        print("\n[Plot saved to /tmp/mann_whitney.png]")
 93        plt.close()
 94
 95
 96def wilcoxon_signed_rank():
 97    """Demonstrate Wilcoxon signed-rank test."""
 98    print_section("2. Wilcoxon Signed-Rank Test")
 99
100    print("Paired comparison (non-normal differences)")
101
102    # Generate paired data
103    np.random.seed(123)
104    n = 30
105    before = np.random.gamma(3, 2, n)
106    treatment_effect = np.random.exponential(1.5, n)
107    after = before + treatment_effect
108
109    differences = after - before
110
111    print(f"\nPaired data: n={n}")
112    print(f"\nBefore:")
113    print(f"  Median: {np.median(before):.2f}")
114    print(f"After:")
115    print(f"  Median: {np.median(after):.2f}")
116
117    print(f"\nDifferences (after - before):")
118    print(f"  Median: {np.median(differences):.2f}")
119    print(f"  Mean: {np.mean(differences):.2f}")
120
121    # Wilcoxon signed-rank test
122    statistic, p_value = stats.wilcoxon(after, before, alternative='greater')
123
124    print(f"\nWilcoxon signed-rank test (H₁: after > before):")
125    print(f"  Statistic: {statistic:.2f}")
126    print(f"  p-value: {p_value:.6f}")
127
128    if p_value < 0.05:
129        print(f"  Significant increase detected (p < 0.05)")
130    else:
131        print(f"  No significant increase (p ≥ 0.05)")
132
133    # Compare with paired t-test
134    t_stat, t_p = stats.ttest_rel(after, before)
135
136    print(f"\nFor comparison, paired t-test p-value: {t_p/2:.6f}")
137    print(f"  (one-sided)")
138
139    # Check normality of differences
140    shapiro_stat, shapiro_p = stats.shapiro(differences)
141
142    print(f"\nShapiro-Wilk test on differences:")
143    print(f"  p-value: {shapiro_p:.6f}")
144    if shapiro_p < 0.05:
145        print(f"  Differences not normally distributed (Wilcoxon appropriate)")
146
147
148def kruskal_wallis_test():
149    """Demonstrate Kruskal-Wallis test."""
150    print_section("3. Kruskal-Wallis Test")
151
152    print("Comparing multiple independent groups (nonparametric ANOVA)")
153
154    # Generate data from different distributions
155    np.random.seed(456)
156    group1 = np.random.lognormal(1, 0.5, 25)
157    group2 = np.random.lognormal(1.2, 0.5, 30)
158    group3 = np.random.lognormal(1.4, 0.5, 28)
159
160    print(f"\nGroup 1: n={len(group1)}, median={np.median(group1):.2f}")
161    print(f"Group 2: n={len(group2)}, median={np.median(group2):.2f}")
162    print(f"Group 3: n={len(group3)}, median={np.median(group3):.2f}")
163
164    # Kruskal-Wallis test
165    statistic, p_value = stats.kruskal(group1, group2, group3)
166
167    print(f"\nKruskal-Wallis test:")
168    print(f"  H statistic: {statistic:.4f}")
169    print(f"  p-value: {p_value:.6f}")
170
171    if p_value < 0.05:
172        print(f"  Significant differences among groups (p < 0.05)")
173    else:
174        print(f"  No significant differences (p ≥ 0.05)")
175
176    # Compare with one-way ANOVA
177    f_stat, anova_p = stats.f_oneway(group1, group2, group3)
178
179    print(f"\nFor comparison, one-way ANOVA p-value: {anova_p:.6f}")
180
181    if HAS_PLT:
182        plt.figure(figsize=(10, 6))
183        plt.boxplot([group1, group2, group3],
184                   labels=['Group 1', 'Group 2', 'Group 3'])
185        plt.ylabel('Value')
186        plt.title(f'Kruskal-Wallis Test (H={statistic:.2f}, p={p_value:.4f})')
187        plt.grid(True, alpha=0.3)
188        plt.savefig('/tmp/kruskal_wallis.png', dpi=100)
189        print("\n[Plot saved to /tmp/kruskal_wallis.png]")
190        plt.close()
191
192
193def kolmogorov_smirnov_test():
194    """Demonstrate Kolmogorov-Smirnov test."""
195    print_section("4. Kolmogorov-Smirnov Test")
196
197    print("Testing goodness-of-fit to a distribution")
198
199    # Generate data
200    np.random.seed(789)
201    n = 100
202
203    # Sample 1: Actually normal
204    sample_normal = np.random.normal(5, 2, n)
205
206    # Sample 2: Mixture (not normal)
207    sample_mixture = np.concatenate([
208        np.random.normal(3, 1, n//2),
209        np.random.normal(7, 1, n//2)
210    ])
211
212    print(f"\nSample size: {n}")
213
214    # Test against normal distribution
215    print(f"\n1. Sample from N(5, 2²) tested against N(5, 2²):")
216
217    stat1, p1 = stats.kstest(sample_normal, 'norm',
218                             args=(np.mean(sample_normal), np.std(sample_normal, ddof=1)))
219
220    print(f"  KS statistic: {stat1:.4f}")
221    print(f"  p-value: {p1:.6f}")
222    if p1 > 0.05:
223        print(f"  Consistent with normal distribution")
224
225    print(f"\n2. Sample from mixture tested against normal:")
226
227    stat2, p2 = stats.kstest(sample_mixture, 'norm',
228                             args=(np.mean(sample_mixture), np.std(sample_mixture, ddof=1)))
229
230    print(f"  KS statistic: {stat2:.4f}")
231    print(f"  p-value: {p2:.6f}")
232    if p2 < 0.05:
233        print(f"  Not consistent with normal distribution (p < 0.05)")
234
235    # Two-sample KS test
236    print(f"\n3. Two-sample KS test (comparing two samples):")
237
238    sample_a = np.random.normal(5, 2, 80)
239    sample_b = np.random.normal(5.5, 2, 80)
240
241    stat_2s, p_2s = stats.ks_2samp(sample_a, sample_b)
242
243    print(f"  KS statistic: {stat_2s:.4f}")
244    print(f"  p-value: {p_2s:.6f}")
245
246    if HAS_PLT:
247        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
248
249        # Normal sample - histogram
250        axes[0, 0].hist(sample_normal, bins=20, density=True, alpha=0.7, edgecolor='black')
251        x = np.linspace(sample_normal.min(), sample_normal.max(), 100)
252        axes[0, 0].plot(x, stats.norm.pdf(x, np.mean(sample_normal), np.std(sample_normal, ddof=1)),
253                       'r-', linewidth=2, label='Fitted normal')
254        axes[0, 0].set_xlabel('Value')
255        axes[0, 0].set_ylabel('Density')
256        axes[0, 0].set_title(f'Normal Sample (KS p={p1:.3f})')
257        axes[0, 0].legend()
258        axes[0, 0].grid(True, alpha=0.3)
259
260        # Normal sample - ECDF
261        sorted_normal = np.sort(sample_normal)
262        ecdf_normal = np.arange(1, len(sorted_normal) + 1) / len(sorted_normal)
263        axes[0, 1].plot(sorted_normal, ecdf_normal, 'b-', linewidth=2, label='ECDF')
264        axes[0, 1].plot(x, stats.norm.cdf(x, np.mean(sample_normal), np.std(sample_normal, ddof=1)),
265                       'r--', linewidth=2, label='Theoretical CDF')
266        axes[0, 1].set_xlabel('Value')
267        axes[0, 1].set_ylabel('Cumulative Probability')
268        axes[0, 1].set_title('Normal Sample - CDF Comparison')
269        axes[0, 1].legend()
270        axes[0, 1].grid(True, alpha=0.3)
271
272        # Mixture sample - histogram
273        axes[1, 0].hist(sample_mixture, bins=20, density=True, alpha=0.7, edgecolor='black')
274        x2 = np.linspace(sample_mixture.min(), sample_mixture.max(), 100)
275        axes[1, 0].plot(x2, stats.norm.pdf(x2, np.mean(sample_mixture), np.std(sample_mixture, ddof=1)),
276                       'r-', linewidth=2, label='Fitted normal')
277        axes[1, 0].set_xlabel('Value')
278        axes[1, 0].set_ylabel('Density')
279        axes[1, 0].set_title(f'Mixture Sample (KS p={p2:.3f})')
280        axes[1, 0].legend()
281        axes[1, 0].grid(True, alpha=0.3)
282
283        # Mixture sample - ECDF
284        sorted_mixture = np.sort(sample_mixture)
285        ecdf_mixture = np.arange(1, len(sorted_mixture) + 1) / len(sorted_mixture)
286        axes[1, 1].plot(sorted_mixture, ecdf_mixture, 'b-', linewidth=2, label='ECDF')
287        axes[1, 1].plot(x2, stats.norm.cdf(x2, np.mean(sample_mixture), np.std(sample_mixture, ddof=1)),
288                       'r--', linewidth=2, label='Theoretical CDF')
289        axes[1, 1].set_xlabel('Value')
290        axes[1, 1].set_ylabel('Cumulative Probability')
291        axes[1, 1].set_title('Mixture Sample - CDF Comparison')
292        axes[1, 1].legend()
293        axes[1, 1].grid(True, alpha=0.3)
294
295        plt.tight_layout()
296        plt.savefig('/tmp/ks_test.png', dpi=100)
297        print("\n[Plot saved to /tmp/ks_test.png]")
298        plt.close()
299
300
301def kernel_density_estimation():
302    """Demonstrate kernel density estimation."""
303    print_section("5. Kernel Density Estimation")
304
305    # Generate bimodal data
306    np.random.seed(999)
307    n1, n2 = 100, 80
308    mode1 = np.random.normal(2, 0.8, n1)
309    mode2 = np.random.normal(6, 1.2, n2)
310    data = np.concatenate([mode1, mode2])
311
312    print(f"Generated bimodal data: {len(data)} points")
313    print(f"Mode 1: n={n1}, mean={np.mean(mode1):.2f}")
314    print(f"Mode 2: n={n2}, mean={np.mean(mode2):.2f}")
315
316    # Kernel density estimation with different bandwidths
317    bandwidths = [0.3, 0.5, 1.0, 2.0]
318
319    print(f"\nKernel density estimation with different bandwidths:")
320
321    x_grid = np.linspace(data.min() - 2, data.max() + 2, 500)
322
323    if HAS_PLT:
324        plt.figure(figsize=(12, 8))
325
326        for i, bw in enumerate(bandwidths, 1):
327            # Gaussian KDE
328            kde = stats.gaussian_kde(data, bw_method=bw)
329            density = kde(x_grid)
330
331            print(f"\n  Bandwidth = {bw}:")
332            print(f"    Peak density: {density.max():.4f}")
333
334            plt.subplot(2, 2, i)
335            plt.hist(data, bins=30, density=True, alpha=0.5, edgecolor='black')
336            plt.plot(x_grid, density, 'r-', linewidth=2, label=f'KDE (bw={bw})')
337            plt.xlabel('Value')
338            plt.ylabel('Density')
339            plt.title(f'KDE with Bandwidth = {bw}')
340            plt.legend()
341            plt.grid(True, alpha=0.3)
342
343        plt.tight_layout()
344        plt.savefig('/tmp/kde.png', dpi=100)
345        print("\n[Plot saved to /tmp/kde.png]")
346        plt.close()
347
348    # Optimal bandwidth (Scott's rule)
349    kde_scott = stats.gaussian_kde(data, bw_method='scott')
350    optimal_bw = kde_scott.factor * data.std(ddof=1)
351
352    print(f"\nScott's rule optimal bandwidth: {optimal_bw:.4f}")
353
354
355def bootstrap_confidence_intervals():
356    """Demonstrate bootstrap confidence intervals."""
357    print_section("6. Bootstrap Confidence Intervals")
358
359    # Generate skewed data
360    np.random.seed(111)
361    data = np.random.gamma(2, 2, 80)
362
363    print(f"Sample size: {len(data)}")
364    print(f"Sample mean: {np.mean(data):.2f}")
365    print(f"Sample median: {np.median(data):.2f}")
366    print(f"Sample std: {np.std(data, ddof=1):.2f}")
367
368    # Bootstrap
369    n_bootstrap = 5000
370    alpha = 0.05
371
372    print(f"\nBootstrap resampling: {n_bootstrap} iterations")
373
374    # Statistics to bootstrap
375    boot_means = []
376    boot_medians = []
377    boot_stds = []
378    boot_trimmed_means = []
379
380    for _ in range(n_bootstrap):
381        boot_sample = np.random.choice(data, len(data), replace=True)
382        boot_means.append(np.mean(boot_sample))
383        boot_medians.append(np.median(boot_sample))
384        boot_stds.append(np.std(boot_sample, ddof=1))
385        # 10% trimmed mean
386        boot_trimmed_means.append(stats.trim_mean(boot_sample, 0.1))
387
388    boot_means = np.array(boot_means)
389    boot_medians = np.array(boot_medians)
390    boot_stds = np.array(boot_stds)
391    boot_trimmed_means = np.array(boot_trimmed_means)
392
393    print(f"\nBootstrap confidence intervals (95%):")
394
395    # Percentile method
396    ci_mean = np.percentile(boot_means, [2.5, 97.5])
397    ci_median = np.percentile(boot_medians, [2.5, 97.5])
398    ci_std = np.percentile(boot_stds, [2.5, 97.5])
399    ci_trimmed = np.percentile(boot_trimmed_means, [2.5, 97.5])
400
401    print(f"\n  Mean:")
402    print(f"    Point estimate: {np.mean(data):.2f}")
403    print(f"    Bootstrap CI: [{ci_mean[0]:.2f}, {ci_mean[1]:.2f}]")
404    print(f"    Bootstrap SE: {np.std(boot_means, ddof=1):.2f}")
405
406    print(f"\n  Median:")
407    print(f"    Point estimate: {np.median(data):.2f}")
408    print(f"    Bootstrap CI: [{ci_median[0]:.2f}, {ci_median[1]:.2f}]")
409
410    print(f"\n  Std Dev:")
411    print(f"    Point estimate: {np.std(data, ddof=1):.2f}")
412    print(f"    Bootstrap CI: [{ci_std[0]:.2f}, {ci_std[1]:.2f}]")
413
414    print(f"\n  Trimmed Mean (10%):")
415    print(f"    Point estimate: {stats.trim_mean(data, 0.1):.2f}")
416    print(f"    Bootstrap CI: [{ci_trimmed[0]:.2f}, {ci_trimmed[1]:.2f}]")
417
418    if HAS_PLT:
419        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
420
421        # Mean
422        axes[0, 0].hist(boot_means, bins=50, edgecolor='black', alpha=0.7)
423        axes[0, 0].axvline(np.mean(data), color='red', linestyle='--', linewidth=2, label='Sample mean')
424        axes[0, 0].axvline(ci_mean[0], color='green', linestyle='--', linewidth=1.5)
425        axes[0, 0].axvline(ci_mean[1], color='green', linestyle='--', linewidth=1.5, label='95% CI')
426        axes[0, 0].set_xlabel('Mean')
427        axes[0, 0].set_ylabel('Frequency')
428        axes[0, 0].set_title('Bootstrap Distribution of Mean')
429        axes[0, 0].legend()
430        axes[0, 0].grid(True, alpha=0.3)
431
432        # Median
433        axes[0, 1].hist(boot_medians, bins=50, edgecolor='black', alpha=0.7, color='orange')
434        axes[0, 1].axvline(np.median(data), color='red', linestyle='--', linewidth=2, label='Sample median')
435        axes[0, 1].axvline(ci_median[0], color='green', linestyle='--', linewidth=1.5)
436        axes[0, 1].axvline(ci_median[1], color='green', linestyle='--', linewidth=1.5, label='95% CI')
437        axes[0, 1].set_xlabel('Median')
438        axes[0, 1].set_ylabel('Frequency')
439        axes[0, 1].set_title('Bootstrap Distribution of Median')
440        axes[0, 1].legend()
441        axes[0, 1].grid(True, alpha=0.3)
442
443        # Std
444        axes[1, 0].hist(boot_stds, bins=50, edgecolor='black', alpha=0.7, color='green')
445        axes[1, 0].axvline(np.std(data, ddof=1), color='red', linestyle='--', linewidth=2, label='Sample std')
446        axes[1, 0].axvline(ci_std[0], color='blue', linestyle='--', linewidth=1.5)
447        axes[1, 0].axvline(ci_std[1], color='blue', linestyle='--', linewidth=1.5, label='95% CI')
448        axes[1, 0].set_xlabel('Standard Deviation')
449        axes[1, 0].set_ylabel('Frequency')
450        axes[1, 0].set_title('Bootstrap Distribution of Std Dev')
451        axes[1, 0].legend()
452        axes[1, 0].grid(True, alpha=0.3)
453
454        # Trimmed mean
455        axes[1, 1].hist(boot_trimmed_means, bins=50, edgecolor='black', alpha=0.7, color='purple')
456        axes[1, 1].axvline(stats.trim_mean(data, 0.1), color='red', linestyle='--', linewidth=2, label='Sample')
457        axes[1, 1].axvline(ci_trimmed[0], color='green', linestyle='--', linewidth=1.5)
458        axes[1, 1].axvline(ci_trimmed[1], color='green', linestyle='--', linewidth=1.5, label='95% CI')
459        axes[1, 1].set_xlabel('Trimmed Mean (10%)')
460        axes[1, 1].set_ylabel('Frequency')
461        axes[1, 1].set_title('Bootstrap Distribution of Trimmed Mean')
462        axes[1, 1].legend()
463        axes[1, 1].grid(True, alpha=0.3)
464
465        plt.tight_layout()
466        plt.savefig('/tmp/bootstrap_ci.png', dpi=100)
467        print("\n[Plot saved to /tmp/bootstrap_ci.png]")
468        plt.close()
469
470
471def main():
472    """Run all demonstrations."""
473    print("=" * 70)
474    print("  NONPARAMETRIC METHODS DEMONSTRATIONS")
475    print("=" * 70)
476
477    np.random.seed(42)
478
479    mann_whitney_test()
480    wilcoxon_signed_rank()
481    kruskal_wallis_test()
482    kolmogorov_smirnov_test()
483    kernel_density_estimation()
484    bootstrap_confidence_intervals()
485
486    print("\n" + "=" * 70)
487    print("  All demonstrations completed successfully!")
488    print("=" * 70)
489
490
491if __name__ == "__main__":
492    main()