02_sampling_estimation.py

  1"""
  202_sampling_estimation.py
  3
  4Demonstrates sampling methods and estimation techniques:
  5- Simple random sampling
  6- Stratified sampling
  7- Bootstrap estimation
  8- Confidence intervals
  9- Bias and variance of estimators
 10- Maximum Likelihood Estimation (MLE)
 11"""
 12
 13import numpy as np
 14from scipy import stats
 15
 16try:
 17    import matplotlib.pyplot as plt
 18    HAS_PLT = True
 19except ImportError:
 20    HAS_PLT = False
 21    print("matplotlib not available; skipping plots\n")
 22
 23
 24def print_section(title):
 25    """Print formatted section header."""
 26    print("\n" + "=" * 70)
 27    print(f"  {title}")
 28    print("=" * 70)
 29
 30
 31def simple_random_sampling():
 32    """Demonstrate simple random sampling."""
 33    print_section("1. Simple Random Sampling")
 34
 35    # Create population
 36    population_size = 10000
 37    population = np.random.normal(100, 15, population_size)
 38
 39    print(f"Population size: {population_size}")
 40    print(f"Population mean: {np.mean(population):.2f}")
 41    print(f"Population std: {np.std(population, ddof=0):.2f}")
 42
 43    # Different sample sizes
 44    sample_sizes = [10, 50, 100, 500]
 45    n_replications = 1000
 46
 47    print(f"\nSimple random sampling ({n_replications} replications):")
 48
 49    for n in sample_sizes:
 50        sample_means = []
 51        for _ in range(n_replications):
 52            sample = np.random.choice(population, n, replace=False)
 53            sample_means.append(np.mean(sample))
 54
 55        sample_means = np.array(sample_means)
 56
 57        print(f"\n  Sample size n={n}:")
 58        print(f"    Mean of estimates: {np.mean(sample_means):.2f}")
 59        print(f"    Std of estimates: {np.std(sample_means, ddof=1):.2f}")
 60        print(f"    Theoretical SE: {np.std(population)/np.sqrt(n):.2f}")
 61        print(f"    Min estimate: {np.min(sample_means):.2f}")
 62        print(f"    Max estimate: {np.max(sample_means):.2f}")
 63
 64
 65def stratified_sampling():
 66    """Demonstrate stratified sampling."""
 67    print_section("2. Stratified Sampling")
 68
 69    # Create stratified population
 70    # Stratum 1: 60%, mean=90
 71    # Stratum 2: 30%, mean=110
 72    # Stratum 3: 10%, mean=130
 73
 74    n1, n2, n3 = 6000, 3000, 1000
 75    stratum1 = np.random.normal(90, 12, n1)
 76    stratum2 = np.random.normal(110, 15, n2)
 77    stratum3 = np.random.normal(130, 18, n3)
 78
 79    population = np.concatenate([stratum1, stratum2, stratum3])
 80
 81    print("Population composition:")
 82    print(f"  Stratum 1: n={n1}, mean={np.mean(stratum1):.2f}")
 83    print(f"  Stratum 2: n={n2}, mean={np.mean(stratum2):.2f}")
 84    print(f"  Stratum 3: n={n3}, mean={np.mean(stratum3):.2f}")
 85    print(f"  Overall mean: {np.mean(population):.2f}")
 86
 87    # Compare simple vs stratified sampling
 88    total_sample_size = 300
 89    n_replications = 1000
 90
 91    simple_means = []
 92    stratified_means = []
 93
 94    for _ in range(n_replications):
 95        # Simple random sampling
 96        simple_sample = np.random.choice(population, total_sample_size, replace=False)
 97        simple_means.append(np.mean(simple_sample))
 98
 99        # Stratified sampling (proportional allocation)
100        s1_sample = np.random.choice(stratum1, 180, replace=False)  # 60%
101        s2_sample = np.random.choice(stratum2, 90, replace=False)   # 30%
102        s3_sample = np.random.choice(stratum3, 30, replace=False)   # 10%
103        stratified_sample = np.concatenate([s1_sample, s2_sample, s3_sample])
104        stratified_means.append(np.mean(stratified_sample))
105
106    simple_means = np.array(simple_means)
107    stratified_means = np.array(stratified_means)
108
109    print(f"\nComparison (n={total_sample_size}, {n_replications} replications):")
110    print(f"\n  Simple Random Sampling:")
111    print(f"    Mean: {np.mean(simple_means):.2f}")
112    print(f"    Std: {np.std(simple_means, ddof=1):.2f}")
113
114    print(f"\n  Stratified Sampling:")
115    print(f"    Mean: {np.mean(stratified_means):.2f}")
116    print(f"    Std: {np.std(stratified_means, ddof=1):.2f}")
117
118    efficiency = np.var(simple_means, ddof=1) / np.var(stratified_means, ddof=1)
119    print(f"\n  Efficiency gain: {efficiency:.2f}x")
120    print(f"  Variance reduction: {(1 - 1/efficiency)*100:.1f}%")
121
122
123def bootstrap_estimation():
124    """Demonstrate bootstrap estimation."""
125    print_section("3. Bootstrap Estimation")
126
127    # Original sample
128    np.random.seed(42)
129    sample = np.random.lognormal(3, 0.5, 100)
130
131    print(f"Original sample size: {len(sample)}")
132    print(f"Sample mean: {np.mean(sample):.2f}")
133    print(f"Sample median: {np.median(sample):.2f}")
134    print(f"Sample std: {np.std(sample, ddof=1):.2f}")
135
136    # Bootstrap
137    n_bootstrap = 5000
138    bootstrap_means = []
139    bootstrap_medians = []
140    bootstrap_stds = []
141
142    for _ in range(n_bootstrap):
143        bootstrap_sample = np.random.choice(sample, len(sample), replace=True)
144        bootstrap_means.append(np.mean(bootstrap_sample))
145        bootstrap_medians.append(np.median(bootstrap_sample))
146        bootstrap_stds.append(np.std(bootstrap_sample, ddof=1))
147
148    bootstrap_means = np.array(bootstrap_means)
149    bootstrap_medians = np.array(bootstrap_medians)
150    bootstrap_stds = np.array(bootstrap_stds)
151
152    print(f"\nBootstrap results ({n_bootstrap} resamples):")
153
154    print(f"\n  Mean:")
155    print(f"    Bootstrap mean: {np.mean(bootstrap_means):.2f}")
156    print(f"    Bootstrap SE: {np.std(bootstrap_means, ddof=1):.2f}")
157    print(f"    95% CI: [{np.percentile(bootstrap_means, 2.5):.2f}, "
158          f"{np.percentile(bootstrap_means, 97.5):.2f}]")
159
160    print(f"\n  Median:")
161    print(f"    Bootstrap mean: {np.mean(bootstrap_medians):.2f}")
162    print(f"    Bootstrap SE: {np.std(bootstrap_medians, ddof=1):.2f}")
163    print(f"    95% CI: [{np.percentile(bootstrap_medians, 2.5):.2f}, "
164          f"{np.percentile(bootstrap_medians, 97.5):.2f}]")
165
166    if HAS_PLT:
167        fig, axes = plt.subplots(1, 3, figsize=(14, 4))
168
169        axes[0].hist(bootstrap_means, bins=50, alpha=0.7, edgecolor='black')
170        axes[0].axvline(np.mean(sample), color='red', linestyle='--', label='Original')
171        axes[0].set_xlabel('Mean')
172        axes[0].set_title('Bootstrap Distribution of Mean')
173        axes[0].legend()
174        axes[0].grid(True, alpha=0.3)
175
176        axes[1].hist(bootstrap_medians, bins=50, alpha=0.7, edgecolor='black', color='green')
177        axes[1].axvline(np.median(sample), color='red', linestyle='--', label='Original')
178        axes[1].set_xlabel('Median')
179        axes[1].set_title('Bootstrap Distribution of Median')
180        axes[1].legend()
181        axes[1].grid(True, alpha=0.3)
182
183        axes[2].hist(bootstrap_stds, bins=50, alpha=0.7, edgecolor='black', color='orange')
184        axes[2].axvline(np.std(sample, ddof=1), color='red', linestyle='--', label='Original')
185        axes[2].set_xlabel('Std Dev')
186        axes[2].set_title('Bootstrap Distribution of Std Dev')
187        axes[2].legend()
188        axes[2].grid(True, alpha=0.3)
189
190        plt.tight_layout()
191        plt.savefig('/tmp/bootstrap_demo.png', dpi=100)
192        print("\n[Plot saved to /tmp/bootstrap_demo.png]")
193        plt.close()
194
195
196def confidence_intervals():
197    """Demonstrate confidence interval construction."""
198    print_section("4. Confidence Intervals")
199
200    # True population
201    true_mean = 100
202    true_std = 15
203    sample_size = 50
204    n_samples = 1000
205    confidence_level = 0.95
206
207    print(f"Population: N({true_mean}, {true_std}²)")
208    print(f"Sample size: {sample_size}")
209    print(f"Confidence level: {confidence_level*100}%")
210    print(f"Number of samples: {n_samples}")
211
212    # Generate samples and compute CIs
213    coverage_count = 0
214    ci_widths = []
215
216    for _ in range(n_samples):
217        sample = np.random.normal(true_mean, true_std, sample_size)
218        sample_mean = np.mean(sample)
219        sample_se = stats.sem(sample)
220
221        # t-based CI
222        ci = stats.t.interval(confidence_level, sample_size - 1,
223                             loc=sample_mean, scale=sample_se)
224        ci_widths.append(ci[1] - ci[0])
225
226        if ci[0] <= true_mean <= ci[1]:
227            coverage_count += 1
228
229    coverage_rate = coverage_count / n_samples
230
231    print(f"\nResults:")
232    print(f"  Coverage rate: {coverage_rate*100:.1f}%")
233    print(f"  Expected coverage: {confidence_level*100}%")
234    print(f"  Average CI width: {np.mean(ci_widths):.2f}")
235    print(f"  CI width std: {np.std(ci_widths, ddof=1):.2f}")
236
237    # Example single sample CI
238    example_sample = np.random.normal(true_mean, true_std, sample_size)
239    ex_mean = np.mean(example_sample)
240    ex_se = stats.sem(example_sample)
241    ex_ci = stats.t.interval(confidence_level, sample_size - 1,
242                            loc=ex_mean, scale=ex_se)
243
244    print(f"\nExample single sample:")
245    print(f"  Sample mean: {ex_mean:.2f}")
246    print(f"  Standard error: {ex_se:.2f}")
247    print(f"  95% CI: [{ex_ci[0]:.2f}, {ex_ci[1]:.2f}]")
248    print(f"  Contains true mean: {ex_ci[0] <= true_mean <= ex_ci[1]}")
249
250
251def bias_variance_estimators():
252    """Demonstrate bias and variance of estimators."""
253    print_section("5. Bias and Variance of Estimators")
254
255    # True population variance
256    true_var = 225  # std = 15
257    sample_size = 20
258    n_samples = 5000
259
260    print(f"True population variance: {true_var}")
261    print(f"Sample size: {sample_size}")
262    print(f"Number of samples: {n_samples}")
263
264    # Two estimators for variance
265    biased_vars = []
266    unbiased_vars = []
267
268    for _ in range(n_samples):
269        sample = np.random.normal(100, np.sqrt(true_var), sample_size)
270
271        # Biased estimator (divide by n)
272        biased_var = np.var(sample, ddof=0)
273        biased_vars.append(biased_var)
274
275        # Unbiased estimator (divide by n-1)
276        unbiased_var = np.var(sample, ddof=1)
277        unbiased_vars.append(unbiased_var)
278
279    biased_vars = np.array(biased_vars)
280    unbiased_vars = np.array(unbiased_vars)
281
282    print("\nBiased estimator (divide by n):")
283    print(f"  Mean: {np.mean(biased_vars):.2f}")
284    print(f"  Bias: {np.mean(biased_vars) - true_var:.2f}")
285    print(f"  Variance: {np.var(biased_vars, ddof=1):.2f}")
286    print(f"  MSE: {np.mean((biased_vars - true_var)**2):.2f}")
287
288    print("\nUnbiased estimator (divide by n-1):")
289    print(f"  Mean: {np.mean(unbiased_vars):.2f}")
290    print(f"  Bias: {np.mean(unbiased_vars) - true_var:.2f}")
291    print(f"  Variance: {np.var(unbiased_vars, ddof=1):.2f}")
292    print(f"  MSE: {np.mean((unbiased_vars - true_var)**2):.2f}")
293
294
295def mle_normal():
296    """Demonstrate Maximum Likelihood Estimation for normal distribution."""
297    print_section("6. MLE for Normal Distribution")
298
299    # True parameters
300    true_mu = 50
301    true_sigma = 10
302    sample_size = 100
303
304    # Generate sample
305    np.random.seed(123)
306    sample = np.random.normal(true_mu, true_sigma, sample_size)
307
308    print(f"True parameters: μ={true_mu}, σ={true_sigma}")
309    print(f"Sample size: {sample_size}")
310
311    # MLE estimates
312    mle_mu = np.mean(sample)
313    mle_sigma = np.std(sample, ddof=0)  # MLE uses n, not n-1
314
315    print(f"\nMLE estimates:")
316    print(f"  μ̂ = {mle_mu:.4f}")
317    print(f"  σ̂ = {mle_sigma:.4f}")
318
319    # Unbiased estimate for comparison
320    unbiased_sigma = np.std(sample, ddof=1)
321    print(f"\nUnbiased σ estimate: {unbiased_sigma:.4f}")
322
323    # Log-likelihood
324    log_likelihood = np.sum(stats.norm.logpdf(sample, mle_mu, mle_sigma))
325    print(f"\nLog-likelihood at MLE: {log_likelihood:.2f}")
326
327    # Verify MLE is maximum
328    print("\nLog-likelihood for different parameter values:")
329    mu_candidates = [mle_mu - 2, mle_mu - 1, mle_mu, mle_mu + 1, mle_mu + 2]
330    for mu_cand in mu_candidates:
331        ll = np.sum(stats.norm.logpdf(sample, mu_cand, mle_sigma))
332        marker = " <-- MLE" if abs(mu_cand - mle_mu) < 0.01 else ""
333        print(f"  μ={mu_cand:6.2f}: LL={ll:.2f}{marker}")
334
335
336def main():
337    """Run all demonstrations."""
338    print("=" * 70)
339    print("  SAMPLING AND ESTIMATION DEMONSTRATIONS")
340    print("=" * 70)
341
342    np.random.seed(42)
343
344    simple_random_sampling()
345    stratified_sampling()
346    bootstrap_estimation()
347    confidence_intervals()
348    bias_variance_estimators()
349    mle_normal()
350
351    print("\n" + "=" * 70)
352    print("  All demonstrations completed successfully!")
353    print("=" * 70)
354
355
356if __name__ == "__main__":
357    main()