1"""
202_sampling_estimation.py
3
4Demonstrates sampling methods and estimation techniques:
5- Simple random sampling
6- Stratified sampling
7- Bootstrap estimation
8- Confidence intervals
9- Bias and variance of estimators
10- Maximum Likelihood Estimation (MLE)
11"""
12
13import numpy as np
14from scipy import stats
15
16try:
17 import matplotlib.pyplot as plt
18 HAS_PLT = True
19except ImportError:
20 HAS_PLT = False
21 print("matplotlib not available; skipping plots\n")
22
23
24def print_section(title):
25 """Print formatted section header."""
26 print("\n" + "=" * 70)
27 print(f" {title}")
28 print("=" * 70)
29
30
31def simple_random_sampling():
32 """Demonstrate simple random sampling."""
33 print_section("1. Simple Random Sampling")
34
35 # Create population
36 population_size = 10000
37 population = np.random.normal(100, 15, population_size)
38
39 print(f"Population size: {population_size}")
40 print(f"Population mean: {np.mean(population):.2f}")
41 print(f"Population std: {np.std(population, ddof=0):.2f}")
42
43 # Different sample sizes
44 sample_sizes = [10, 50, 100, 500]
45 n_replications = 1000
46
47 print(f"\nSimple random sampling ({n_replications} replications):")
48
49 for n in sample_sizes:
50 sample_means = []
51 for _ in range(n_replications):
52 sample = np.random.choice(population, n, replace=False)
53 sample_means.append(np.mean(sample))
54
55 sample_means = np.array(sample_means)
56
57 print(f"\n Sample size n={n}:")
58 print(f" Mean of estimates: {np.mean(sample_means):.2f}")
59 print(f" Std of estimates: {np.std(sample_means, ddof=1):.2f}")
60 print(f" Theoretical SE: {np.std(population)/np.sqrt(n):.2f}")
61 print(f" Min estimate: {np.min(sample_means):.2f}")
62 print(f" Max estimate: {np.max(sample_means):.2f}")
63
64
65def stratified_sampling():
66 """Demonstrate stratified sampling."""
67 print_section("2. Stratified Sampling")
68
69 # Create stratified population
70 # Stratum 1: 60%, mean=90
71 # Stratum 2: 30%, mean=110
72 # Stratum 3: 10%, mean=130
73
74 n1, n2, n3 = 6000, 3000, 1000
75 stratum1 = np.random.normal(90, 12, n1)
76 stratum2 = np.random.normal(110, 15, n2)
77 stratum3 = np.random.normal(130, 18, n3)
78
79 population = np.concatenate([stratum1, stratum2, stratum3])
80
81 print("Population composition:")
82 print(f" Stratum 1: n={n1}, mean={np.mean(stratum1):.2f}")
83 print(f" Stratum 2: n={n2}, mean={np.mean(stratum2):.2f}")
84 print(f" Stratum 3: n={n3}, mean={np.mean(stratum3):.2f}")
85 print(f" Overall mean: {np.mean(population):.2f}")
86
87 # Compare simple vs stratified sampling
88 total_sample_size = 300
89 n_replications = 1000
90
91 simple_means = []
92 stratified_means = []
93
94 for _ in range(n_replications):
95 # Simple random sampling
96 simple_sample = np.random.choice(population, total_sample_size, replace=False)
97 simple_means.append(np.mean(simple_sample))
98
99 # Stratified sampling (proportional allocation)
100 s1_sample = np.random.choice(stratum1, 180, replace=False) # 60%
101 s2_sample = np.random.choice(stratum2, 90, replace=False) # 30%
102 s3_sample = np.random.choice(stratum3, 30, replace=False) # 10%
103 stratified_sample = np.concatenate([s1_sample, s2_sample, s3_sample])
104 stratified_means.append(np.mean(stratified_sample))
105
106 simple_means = np.array(simple_means)
107 stratified_means = np.array(stratified_means)
108
109 print(f"\nComparison (n={total_sample_size}, {n_replications} replications):")
110 print(f"\n Simple Random Sampling:")
111 print(f" Mean: {np.mean(simple_means):.2f}")
112 print(f" Std: {np.std(simple_means, ddof=1):.2f}")
113
114 print(f"\n Stratified Sampling:")
115 print(f" Mean: {np.mean(stratified_means):.2f}")
116 print(f" Std: {np.std(stratified_means, ddof=1):.2f}")
117
118 efficiency = np.var(simple_means, ddof=1) / np.var(stratified_means, ddof=1)
119 print(f"\n Efficiency gain: {efficiency:.2f}x")
120 print(f" Variance reduction: {(1 - 1/efficiency)*100:.1f}%")
121
122
123def bootstrap_estimation():
124 """Demonstrate bootstrap estimation."""
125 print_section("3. Bootstrap Estimation")
126
127 # Original sample
128 np.random.seed(42)
129 sample = np.random.lognormal(3, 0.5, 100)
130
131 print(f"Original sample size: {len(sample)}")
132 print(f"Sample mean: {np.mean(sample):.2f}")
133 print(f"Sample median: {np.median(sample):.2f}")
134 print(f"Sample std: {np.std(sample, ddof=1):.2f}")
135
136 # Bootstrap
137 n_bootstrap = 5000
138 bootstrap_means = []
139 bootstrap_medians = []
140 bootstrap_stds = []
141
142 for _ in range(n_bootstrap):
143 bootstrap_sample = np.random.choice(sample, len(sample), replace=True)
144 bootstrap_means.append(np.mean(bootstrap_sample))
145 bootstrap_medians.append(np.median(bootstrap_sample))
146 bootstrap_stds.append(np.std(bootstrap_sample, ddof=1))
147
148 bootstrap_means = np.array(bootstrap_means)
149 bootstrap_medians = np.array(bootstrap_medians)
150 bootstrap_stds = np.array(bootstrap_stds)
151
152 print(f"\nBootstrap results ({n_bootstrap} resamples):")
153
154 print(f"\n Mean:")
155 print(f" Bootstrap mean: {np.mean(bootstrap_means):.2f}")
156 print(f" Bootstrap SE: {np.std(bootstrap_means, ddof=1):.2f}")
157 print(f" 95% CI: [{np.percentile(bootstrap_means, 2.5):.2f}, "
158 f"{np.percentile(bootstrap_means, 97.5):.2f}]")
159
160 print(f"\n Median:")
161 print(f" Bootstrap mean: {np.mean(bootstrap_medians):.2f}")
162 print(f" Bootstrap SE: {np.std(bootstrap_medians, ddof=1):.2f}")
163 print(f" 95% CI: [{np.percentile(bootstrap_medians, 2.5):.2f}, "
164 f"{np.percentile(bootstrap_medians, 97.5):.2f}]")
165
166 if HAS_PLT:
167 fig, axes = plt.subplots(1, 3, figsize=(14, 4))
168
169 axes[0].hist(bootstrap_means, bins=50, alpha=0.7, edgecolor='black')
170 axes[0].axvline(np.mean(sample), color='red', linestyle='--', label='Original')
171 axes[0].set_xlabel('Mean')
172 axes[0].set_title('Bootstrap Distribution of Mean')
173 axes[0].legend()
174 axes[0].grid(True, alpha=0.3)
175
176 axes[1].hist(bootstrap_medians, bins=50, alpha=0.7, edgecolor='black', color='green')
177 axes[1].axvline(np.median(sample), color='red', linestyle='--', label='Original')
178 axes[1].set_xlabel('Median')
179 axes[1].set_title('Bootstrap Distribution of Median')
180 axes[1].legend()
181 axes[1].grid(True, alpha=0.3)
182
183 axes[2].hist(bootstrap_stds, bins=50, alpha=0.7, edgecolor='black', color='orange')
184 axes[2].axvline(np.std(sample, ddof=1), color='red', linestyle='--', label='Original')
185 axes[2].set_xlabel('Std Dev')
186 axes[2].set_title('Bootstrap Distribution of Std Dev')
187 axes[2].legend()
188 axes[2].grid(True, alpha=0.3)
189
190 plt.tight_layout()
191 plt.savefig('/tmp/bootstrap_demo.png', dpi=100)
192 print("\n[Plot saved to /tmp/bootstrap_demo.png]")
193 plt.close()
194
195
196def confidence_intervals():
197 """Demonstrate confidence interval construction."""
198 print_section("4. Confidence Intervals")
199
200 # True population
201 true_mean = 100
202 true_std = 15
203 sample_size = 50
204 n_samples = 1000
205 confidence_level = 0.95
206
207 print(f"Population: N({true_mean}, {true_std}²)")
208 print(f"Sample size: {sample_size}")
209 print(f"Confidence level: {confidence_level*100}%")
210 print(f"Number of samples: {n_samples}")
211
212 # Generate samples and compute CIs
213 coverage_count = 0
214 ci_widths = []
215
216 for _ in range(n_samples):
217 sample = np.random.normal(true_mean, true_std, sample_size)
218 sample_mean = np.mean(sample)
219 sample_se = stats.sem(sample)
220
221 # t-based CI
222 ci = stats.t.interval(confidence_level, sample_size - 1,
223 loc=sample_mean, scale=sample_se)
224 ci_widths.append(ci[1] - ci[0])
225
226 if ci[0] <= true_mean <= ci[1]:
227 coverage_count += 1
228
229 coverage_rate = coverage_count / n_samples
230
231 print(f"\nResults:")
232 print(f" Coverage rate: {coverage_rate*100:.1f}%")
233 print(f" Expected coverage: {confidence_level*100}%")
234 print(f" Average CI width: {np.mean(ci_widths):.2f}")
235 print(f" CI width std: {np.std(ci_widths, ddof=1):.2f}")
236
237 # Example single sample CI
238 example_sample = np.random.normal(true_mean, true_std, sample_size)
239 ex_mean = np.mean(example_sample)
240 ex_se = stats.sem(example_sample)
241 ex_ci = stats.t.interval(confidence_level, sample_size - 1,
242 loc=ex_mean, scale=ex_se)
243
244 print(f"\nExample single sample:")
245 print(f" Sample mean: {ex_mean:.2f}")
246 print(f" Standard error: {ex_se:.2f}")
247 print(f" 95% CI: [{ex_ci[0]:.2f}, {ex_ci[1]:.2f}]")
248 print(f" Contains true mean: {ex_ci[0] <= true_mean <= ex_ci[1]}")
249
250
251def bias_variance_estimators():
252 """Demonstrate bias and variance of estimators."""
253 print_section("5. Bias and Variance of Estimators")
254
255 # True population variance
256 true_var = 225 # std = 15
257 sample_size = 20
258 n_samples = 5000
259
260 print(f"True population variance: {true_var}")
261 print(f"Sample size: {sample_size}")
262 print(f"Number of samples: {n_samples}")
263
264 # Two estimators for variance
265 biased_vars = []
266 unbiased_vars = []
267
268 for _ in range(n_samples):
269 sample = np.random.normal(100, np.sqrt(true_var), sample_size)
270
271 # Biased estimator (divide by n)
272 biased_var = np.var(sample, ddof=0)
273 biased_vars.append(biased_var)
274
275 # Unbiased estimator (divide by n-1)
276 unbiased_var = np.var(sample, ddof=1)
277 unbiased_vars.append(unbiased_var)
278
279 biased_vars = np.array(biased_vars)
280 unbiased_vars = np.array(unbiased_vars)
281
282 print("\nBiased estimator (divide by n):")
283 print(f" Mean: {np.mean(biased_vars):.2f}")
284 print(f" Bias: {np.mean(biased_vars) - true_var:.2f}")
285 print(f" Variance: {np.var(biased_vars, ddof=1):.2f}")
286 print(f" MSE: {np.mean((biased_vars - true_var)**2):.2f}")
287
288 print("\nUnbiased estimator (divide by n-1):")
289 print(f" Mean: {np.mean(unbiased_vars):.2f}")
290 print(f" Bias: {np.mean(unbiased_vars) - true_var:.2f}")
291 print(f" Variance: {np.var(unbiased_vars, ddof=1):.2f}")
292 print(f" MSE: {np.mean((unbiased_vars - true_var)**2):.2f}")
293
294
295def mle_normal():
296 """Demonstrate Maximum Likelihood Estimation for normal distribution."""
297 print_section("6. MLE for Normal Distribution")
298
299 # True parameters
300 true_mu = 50
301 true_sigma = 10
302 sample_size = 100
303
304 # Generate sample
305 np.random.seed(123)
306 sample = np.random.normal(true_mu, true_sigma, sample_size)
307
308 print(f"True parameters: μ={true_mu}, σ={true_sigma}")
309 print(f"Sample size: {sample_size}")
310
311 # MLE estimates
312 mle_mu = np.mean(sample)
313 mle_sigma = np.std(sample, ddof=0) # MLE uses n, not n-1
314
315 print(f"\nMLE estimates:")
316 print(f" μ̂ = {mle_mu:.4f}")
317 print(f" σ̂ = {mle_sigma:.4f}")
318
319 # Unbiased estimate for comparison
320 unbiased_sigma = np.std(sample, ddof=1)
321 print(f"\nUnbiased σ estimate: {unbiased_sigma:.4f}")
322
323 # Log-likelihood
324 log_likelihood = np.sum(stats.norm.logpdf(sample, mle_mu, mle_sigma))
325 print(f"\nLog-likelihood at MLE: {log_likelihood:.2f}")
326
327 # Verify MLE is maximum
328 print("\nLog-likelihood for different parameter values:")
329 mu_candidates = [mle_mu - 2, mle_mu - 1, mle_mu, mle_mu + 1, mle_mu + 2]
330 for mu_cand in mu_candidates:
331 ll = np.sum(stats.norm.logpdf(sample, mu_cand, mle_sigma))
332 marker = " <-- MLE" if abs(mu_cand - mle_mu) < 0.01 else ""
333 print(f" μ={mu_cand:6.2f}: LL={ll:.2f}{marker}")
334
335
336def main():
337 """Run all demonstrations."""
338 print("=" * 70)
339 print(" SAMPLING AND ESTIMATION DEMONSTRATIONS")
340 print("=" * 70)
341
342 np.random.seed(42)
343
344 simple_random_sampling()
345 stratified_sampling()
346 bootstrap_estimation()
347 confidence_intervals()
348 bias_variance_estimators()
349 mle_normal()
350
351 print("\n" + "=" * 70)
352 print(" All demonstrations completed successfully!")
353 print("=" * 70)
354
355
356if __name__ == "__main__":
357 main()