12. Sampling and Estimation
Overview
The core goal of statistics is to infer characteristics of a population from a sample. This chapter covers sampling distributions and point estimation, in particular Maximum Likelihood Estimation (MLE) and the Method of Moments (MoM).
1. Population and Sample
1.1 Basic Concepts
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Population: All objects of interest
# Sample: A portion extracted from the population
# Example: Height of adult males in South Korea (population)
population_mean = 173.5 # Population mean μ
population_std = 6.0 # Population standard deviation σ
# In practice, we cannot know the entire population
# We estimate parameters through samples
# Sample extraction (simple random sampling)
np.random.seed(42)
sample_size = 100
sample = np.random.normal(population_mean, population_std, sample_size)
print("Population parameters (unknown in practice):")
print(f" Population mean (μ): {population_mean}")
print(f" Population std (σ): {population_std}")
print(f"\nSample statistics (n={sample_size}):")
print(f" Sample mean (x̄): {sample.mean():.2f}")
print(f" Sample std (s): {sample.std(ddof=1):.2f}") # ddof=1 for sample std
1.2 Sampling Methods
# Various sampling methods
# 1. Simple Random Sampling
np.random.seed(42)
population = np.arange(1, 1001) # Population from 1 to 1000
simple_random_sample = np.random.choice(population, size=50, replace=False)
print(f"Simple random sampling: {simple_random_sample[:10]}...")
# 2. Stratified Sampling
# Divide population into strata and sample from each
strata_A = np.random.normal(50, 10, 500) # Stratum A
strata_B = np.random.normal(80, 15, 500) # Stratum B
# Proportional sampling from each stratum
sample_A = np.random.choice(strata_A, size=25, replace=False)
sample_B = np.random.choice(strata_B, size=25, replace=False)
stratified_sample = np.concatenate([sample_A, sample_B])
print(f"\nStratified sampling:")
print(f" Stratum A sample mean: {sample_A.mean():.2f}")
print(f" Stratum B sample mean: {sample_B.mean():.2f}")
print(f" Overall sample mean: {stratified_sample.mean():.2f}")
# 3. Systematic Sampling
# Sample every k-th element
k = 20 # Sampling interval
start = np.random.randint(0, k)
systematic_sample = population[start::k]
print(f"\nSystematic sampling (k={k}): {systematic_sample[:5]}...")
2. Sampling Distribution
2.1 Distribution of Sample Mean
# Simulation of sampling distribution of sample mean
np.random.seed(42)
# Population parameters
mu = 100
sigma = 20
# Various sample sizes
sample_sizes = [5, 30, 100]
n_simulations = 10000
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for i, n in enumerate(sample_sizes):
    # Draw many samples and calculate their means
    sample_means = []
    for _ in range(n_simulations):
        sample = np.random.normal(mu, sigma, n)
        sample_means.append(sample.mean())
    sample_means = np.array(sample_means)
    # Visualization
    axes[i].hist(sample_means, bins=50, density=True, alpha=0.7, color='steelblue')
    # Theoretical distribution: X̄ ~ N(μ, σ²/n)
    theoretical_std = sigma / np.sqrt(n)
    x = np.linspace(mu - 4*theoretical_std, mu + 4*theoretical_std, 100)
    axes[i].plot(x, stats.norm.pdf(x, mu, theoretical_std), 'r-', lw=2)
    axes[i].axvline(mu, color='k', linestyle='--', alpha=0.5)
    axes[i].set_xlabel('Sample mean')
    axes[i].set_ylabel('Density')
    axes[i].set_title(f'n = {n}\nSD(X̄) = σ/√n = {theoretical_std:.2f}')
plt.suptitle('Sampling Distribution of Sample Mean', fontsize=14)
plt.tight_layout()
plt.show()
# Numerical verification
print("\nCharacteristics of sample mean distribution:")
for n in sample_sizes:
    sample_means = [np.random.normal(mu, sigma, n).mean() for _ in range(n_simulations)]
    print(f"n={n:3d}: E[X̄]={np.mean(sample_means):.2f}, SD(X̄)={np.std(sample_means):.2f}, "
          f"Theoretical={sigma/np.sqrt(n):.2f}")
2.2 Standard Error
# Standard error = standard deviation of sample statistic
# Standard error of sample mean: SE(X̄) = σ/√n (or s/√n)
def standard_error(sample):
    """Calculate the standard error of the sample mean."""
    n = len(sample)
    s = np.std(sample, ddof=1)  # Sample standard deviation
    se = s / np.sqrt(n)
    return se
# Example
np.random.seed(42)
sample_sizes = [10, 30, 100, 500]
print("Standard error by sample size:")
print("-" * 50)
for n in sample_sizes:
    sample = np.random.normal(100, 20, n)
    se = standard_error(sample)
    theoretical_se = 20 / np.sqrt(n)
    print(f"n = {n:4d}: SE = {se:.3f}, Theoretical = {theoretical_se:.3f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
n_range = np.arange(10, 501)
se_values = 20 / np.sqrt(n_range)
ax.plot(n_range, se_values, 'b-', linewidth=2)
ax.fill_between(n_range, 0, se_values, alpha=0.2)
ax.set_xlabel('Sample size (n)')
ax.set_ylabel('Standard error (SE)')
ax.set_title('Relationship between sample size and standard error: SE = σ/√n')
ax.grid(alpha=0.3)
# Mark specific points
for n in [30, 100, 400]:
    se = 20 / np.sqrt(n)
    ax.scatter([n], [se], color='red', s=100, zorder=5)
    ax.annotate(f'n={n}, SE={se:.2f}', (n, se), xytext=(n+20, se+0.5))
plt.show()
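The formula SE = s/√n is specific to the sample mean. For statistics without a simple closed-form standard error (the median, a trimmed mean), the bootstrap estimates it by resampling the observed sample. A minimal sketch (the 2,000 resamples are an arbitrary choice):

# Bootstrap standard error of the sample median (illustrative sketch)
np.random.seed(0)
sample = np.random.normal(100, 20, 50)
boot_medians = [np.median(np.random.choice(sample, size=len(sample), replace=True))
                for _ in range(2000)]
print(f"Bootstrap SE of the median: {np.std(boot_medians, ddof=1):.3f}")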
2.3 Distribution of Other Sample Statistics
# Distribution of sample variance
np.random.seed(42)
# For a normal population, (n-1)S²/σ² ~ χ²(n-1)
n = 20
sigma_squared = 100  # Population variance
n_simulations = 10000
chi_squared_values = []
for _ in range(n_simulations):
    sample = np.random.normal(0, np.sqrt(sigma_squared), n)
    s_squared = np.var(sample, ddof=1)  # Sample variance
    chi_squared = (n - 1) * s_squared / sigma_squared
    chi_squared_values.append(chi_squared)
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Sample variance distribution
sample_variances = [np.var(np.random.normal(0, 10, n), ddof=1) for _ in range(n_simulations)]
axes[0].hist(sample_variances, bins=50, density=True, alpha=0.7)
axes[0].axvline(sigma_squared, color='r', linestyle='--', label=f'σ² = {sigma_squared}')
axes[0].set_xlabel('Sample variance (S²)')
axes[0].set_ylabel('Density')
axes[0].set_title('Distribution of Sample Variance')
axes[0].legend()
# Chi-squared transformation
axes[1].hist(chi_squared_values, bins=50, density=True, alpha=0.7, label='Simulation')
x = np.linspace(0, 50, 100)
axes[1].plot(x, stats.chi2.pdf(x, df=n-1), 'r-', lw=2, label=f'χ²({n-1})')
axes[1].set_xlabel('(n-1)S²/σ²')
axes[1].set_ylabel('Density')
axes[1].set_title('Chi-squared Transformation of Sample Variance')
axes[1].legend()
plt.tight_layout()
plt.show()
print(f"Simulation mean: {np.mean(chi_squared_values):.2f}, Theoretical (df=n-1): {n-1}")
print(f"Simulation variance: {np.var(chi_squared_values):.2f}, Theoretical (2*df): {2*(n-1)}")
3. Point Estimation
3.1 Properties of Estimators
# Properties of good estimators: Unbiasedness, Consistency, Efficiency
# 1. Unbiasedness: E[θ̂] = θ
# Sample mean is an unbiased estimator of population mean
# Sample variance (divided by n-1) is an unbiased estimator of population variance
np.random.seed(42)
mu, sigma = 50, 10
n = 30
n_simulations = 10000
# Unbiasedness of sample mean
sample_means = [np.random.normal(mu, sigma, n).mean() for _ in range(n_simulations)]
print(f"Expected value of sample mean: {np.mean(sample_means):.4f}, Population mean: {mu}")
print(f"→ Sample mean is an unbiased estimator")
# Comparison of sample variance (divide by n vs n-1)
var_n = [] # Divide by n (biased)
var_n_1 = [] # Divide by n-1 (unbiased)
for _ in range(n_simulations):
    sample = np.random.normal(mu, sigma, n)
    var_n.append(np.var(sample, ddof=0))
    var_n_1.append(np.var(sample, ddof=1))
print(f"\nddof=0 (divide by n) expected value: {np.mean(var_n):.2f}")
print(f"ddof=1 (divide by n-1) expected value: {np.mean(var_n_1):.2f}")
print(f"Population variance: {sigma**2}")
print(f"→ Sample variance divided by n-1 is unbiased")
# 2. Consistency: θ̂ → θ as n → ∞
np.random.seed(42)
mu = 100
sample_sizes = [10, 50, 100, 500, 1000, 5000]
mean_estimates = []
var_estimates = []
for n in sample_sizes:
    sample = np.random.normal(mu, 20, n)
    mean_estimates.append(sample.mean())
    var_estimates.append(np.var(sample, ddof=1))
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(sample_sizes, mean_estimates, 'bo-')
axes[0].axhline(mu, color='r', linestyle='--', label=f'μ = {mu}')
axes[0].set_xlabel('Sample size (n)')
axes[0].set_ylabel('Sample mean')
axes[0].set_title('Consistency: Sample mean → Population mean')
axes[0].legend()
axes[0].set_xscale('log')
axes[1].plot(sample_sizes, var_estimates, 'go-')
axes[1].axhline(400, color='r', linestyle='--', label='σ² = 400')
axes[1].set_xlabel('Sample size (n)')
axes[1].set_ylabel('Sample variance')
axes[1].set_title('Consistency: Sample variance → Population variance')
axes[1].legend()
axes[1].set_xscale('log')
plt.tight_layout()
plt.show()
# 3. Efficiency: More efficient when variance is smaller
# Estimating mean in normal distribution: sample mean vs sample median
np.random.seed(42)
n = 30
n_simulations = 10000
mu = 50
means = []
medians = []
for _ in range(n_simulations):
    sample = np.random.normal(mu, 10, n)
    means.append(sample.mean())
    medians.append(np.median(sample))
print("Efficiency comparison (estimating population mean in normal distribution):")
print(f"Sample mean: Var = {np.var(means):.4f}")
print(f"Sample median: Var = {np.var(medians):.4f}")
print(f"Relative efficiency: {np.var(means) / np.var(medians):.4f}")
print("→ Sample mean is more efficient in normal distribution (ARE ≈ π/2 ≈ 1.57)")
3.2 Bias-Variance Tradeoff
# MSE = Bias² + Variance
def mse_analysis(estimates, true_value):
    """Decompose MSE into bias and variance."""
    estimates = np.array(estimates)
    bias = np.mean(estimates) - true_value
    variance = np.var(estimates)
    mse = np.mean((estimates - true_value)**2)
    return bias, variance, mse
# Example: Variance estimation (n vs n-1)
np.random.seed(42)
n = 10
sigma_squared = 100
n_simulations = 10000
var_n = [np.var(np.random.normal(0, 10, n), ddof=0) for _ in range(n_simulations)]
var_n_1 = [np.var(np.random.normal(0, 10, n), ddof=1) for _ in range(n_simulations)]
print("MSE analysis of variance estimators:")
print("-" * 60)
print(f"{'Estimator':<15} {'Bias':<12} {'Variance':<12} {'MSE':<12}")
print("-" * 60)
for name, estimates in [("Divide by n", var_n), ("Divide by n-1", var_n_1)]:
    bias, var, mse = mse_analysis(estimates, sigma_squared)
    print(f"{name:<15} {bias:<12.4f} {var:<12.4f} {mse:<12.4f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(var_n, bins=50, alpha=0.5, label='Divide by n (biased)', density=True)
ax.hist(var_n_1, bins=50, alpha=0.5, label='Divide by n-1 (unbiased)', density=True)
ax.axvline(sigma_squared, color='k', linestyle='--', linewidth=2, label=f'σ² = {sigma_squared}')
ax.set_xlabel('Estimate')
ax.set_ylabel('Density')
ax.set_title('Bias-Variance Tradeoff')
ax.legend()
plt.show()
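A biased estimator can beat an unbiased one on MSE: for a normal population, dividing the sum of squared deviations by n + 1 minimizes the MSE within this family (a classical result for normal samples). A quick check reusing mse_analysis and the setup above:

# Divide by n+1: larger bias, smaller variance, lowest MSE (normal population)
np.random.seed(0)
var_n_plus_1 = [np.sum((s - s.mean())**2) / (n + 1)
                for s in (np.random.normal(0, 10, n) for _ in range(n_simulations))]
bias, var, mse = mse_analysis(var_n_plus_1, sigma_squared)
print(f"Divide by n+1: Bias={bias:.4f}, Variance={var:.4f}, MSE={mse:.4f}")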
4. Maximum Likelihood Estimation (MLE)
4.1 MLE Concept
# Likelihood function: L(θ|x) = P(X=x|θ)
# MLE: Find θ that maximizes L(θ)
# Example: MLE for Bernoulli distribution
def bernoulli_likelihood(p, data):
    """Likelihood function of the Bernoulli distribution."""
    n = len(data)
    k = sum(data)  # Number of successes
    return (p ** k) * ((1 - p) ** (n - k))

def bernoulli_log_likelihood(p, data):
    """Log-likelihood function of the Bernoulli distribution."""
    n = len(data)
    k = sum(data)
    if p <= 0 or p >= 1:
        return -np.inf
    return k * np.log(p) + (n - k) * np.log(1 - p)
# Generate data
np.random.seed(42)
true_p = 0.3
data = np.random.binomial(1, true_p, size=50)
print(f"True p = {true_p}, Observed success rate = {data.mean():.2f}")
# Visualize likelihood function
p_values = np.linspace(0.01, 0.99, 100)
likelihoods = [bernoulli_likelihood(p, data) for p in p_values]
log_likelihoods = [bernoulli_log_likelihood(p, data) for p in p_values]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(p_values, likelihoods, 'b-', linewidth=2)
axes[0].axvline(data.mean(), color='r', linestyle='--', label=f'MLE p̂ = {data.mean():.2f}')
axes[0].axvline(true_p, color='g', linestyle=':', label=f'True p = {true_p}')
axes[0].set_xlabel('p')
axes[0].set_ylabel('L(p)')
axes[0].set_title('Likelihood Function')
axes[0].legend()
axes[1].plot(p_values, log_likelihoods, 'b-', linewidth=2)
axes[1].axvline(data.mean(), color='r', linestyle='--', label=f'MLE p̂ = {data.mean():.2f}')
axes[1].axvline(true_p, color='g', linestyle=':', label=f'True p = {true_p}')
axes[1].set_xlabel('p')
axes[1].set_ylabel('log L(p)')
axes[1].set_title('Log-Likelihood Function')
axes[1].legend()
plt.tight_layout()
plt.show()
print(f"\nBernoulli MLE: p̂ = x̄ = {data.mean():.4f}")
4.2 MLE for Normal Distribution
from scipy.optimize import minimize
# MLE for normal distribution N(μ, σ²)
# MLE: μ̂ = x̄, σ̂² = (1/n)Σ(x_i - x̄)²
def normal_neg_log_likelihood(params, data):
    """Negative log-likelihood of the normal distribution."""
    mu, sigma = params
    if sigma <= 0:
        return np.inf
    n = len(data)
    ll = -n/2 * np.log(2*np.pi) - n*np.log(sigma) - np.sum((data - mu)**2) / (2*sigma**2)
    return -ll
# Generate data
np.random.seed(42)
true_mu, true_sigma = 50, 10
data = np.random.normal(true_mu, true_sigma, 100)
# Numerical optimization
initial_guess = [0, 1]
result = minimize(normal_neg_log_likelihood, initial_guess, args=(data,),
                  method='L-BFGS-B', bounds=[(-np.inf, np.inf), (0.001, np.inf)])
mu_mle, sigma_mle = result.x
# Compare with analytical solution
mu_analytical = data.mean()
sigma_analytical = np.std(data, ddof=0) # MLE divides by n
print("Normal distribution MLE:")
print("-" * 50)
print(f"True parameters: μ = {true_mu}, σ = {true_sigma}")
print(f"Analytical MLE: μ̂ = {mu_analytical:.4f}, σ̂ = {sigma_analytical:.4f}")
print(f"Numerical MLE: μ̂ = {mu_mle:.4f}, σ̂ = {sigma_mle:.4f}")
# Using scipy.stats
mu_fit, sigma_fit = stats.norm.fit(data)
print(f"scipy.stats: μ̂ = {mu_fit:.4f}, σ̂ = {sigma_fit:.4f}")
4.3 MLE for Poisson Distribution
# MLE for Poisson distribution Poisson(λ)
# MLE: λ̂ = x̄
def poisson_neg_log_likelihood(lam, data):
    """Negative log-likelihood of the Poisson distribution."""
    if lam <= 0:
        return np.inf
    return -np.sum(stats.poisson.logpmf(data, lam))
# Generate data
np.random.seed(42)
true_lambda = 5
data = np.random.poisson(true_lambda, 100)
# Numerical optimization
result = minimize(poisson_neg_log_likelihood, x0=[1], args=(data,),
                  method='L-BFGS-B', bounds=[(0.001, np.inf)])
lambda_mle = result.x[0]
lambda_analytical = data.mean()
print("Poisson distribution MLE:")
print(f"True λ = {true_lambda}")
print(f"Analytical MLE: λ̂ = {lambda_analytical:.4f}")
print(f"Numerical MLE: λ̂ = {lambda_mle:.4f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
x_vals = np.arange(0, 15)
ax.bar(x_vals - 0.2, np.bincount(data, minlength=15)[:15] / len(data),
       width=0.4, alpha=0.7, label='Observed frequency')
ax.bar(x_vals + 0.2, stats.poisson.pmf(x_vals, lambda_mle),
       width=0.4, alpha=0.7, label=f'MLE Poisson(λ̂={lambda_mle:.2f})')
ax.set_xlabel('x')
ax.set_ylabel('Probability/Frequency')
ax.set_title('Poisson Distribution MLE Fitting')
ax.legend()
ax.set_xticks(x_vals)
plt.show()
4.4 MLE for Exponential Distribution
# MLE for exponential distribution Exp(λ)
# MLE: λ̂ = 1/x̄
def exponential_neg_log_likelihood(lam, data):
    """Negative log-likelihood of the exponential distribution."""
    if lam <= 0:
        return np.inf
    n = len(data)
    return -n * np.log(lam) + lam * np.sum(data)
# Generate data
np.random.seed(42)
true_lambda = 0.5
data = np.random.exponential(scale=1/true_lambda, size=100)
# MLE
lambda_mle = 1 / data.mean()
print("Exponential distribution MLE:")
print(f"True λ = {true_lambda}")
print(f"MLE: λ̂ = 1/x̄ = {lambda_mle:.4f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(data, bins=30, density=True, alpha=0.7, label='Data')
x = np.linspace(0, data.max(), 100)
ax.plot(x, stats.expon.pdf(x, scale=1/lambda_mle), 'r-', lw=2,
        label=f'MLE Exp(λ̂={lambda_mle:.3f})')
ax.plot(x, stats.expon.pdf(x, scale=1/true_lambda), 'g--', lw=2,
        label=f'True Exp(λ={true_lambda})')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Exponential Distribution MLE Fitting')
ax.legend()
plt.show()
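The negative log-likelihood defined above is not otherwise used; minimizing it numerically is a quick way to confirm the closed form λ̂ = 1/x̄ (a small sketch; the search bounds are arbitrary):

from scipy.optimize import minimize_scalar
# Numerical check of the analytical exponential MLE
res = minimize_scalar(lambda lam: exponential_neg_log_likelihood(lam, data),
                      bounds=(0.001, 10), method='bounded')
print(f"Numerical MLE: λ̂ = {res.x:.4f} (analytical: {lambda_mle:.4f})")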
4.5 Asymptotic Properties of MLE
# Asymptotic normality of the MLE:
# √n(θ̂_MLE − θ) → N(0, I(θ)⁻¹) in distribution, where I(θ) is the Fisher information
# Simulation: Asymptotic distribution of MLE for normal mean
np.random.seed(42)
true_mu = 50
true_sigma = 10
sample_sizes = [10, 30, 100, 500]
n_simulations = 5000
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for i, n in enumerate(sample_sizes):
    mle_estimates = []
    for _ in range(n_simulations):
        sample = np.random.normal(true_mu, true_sigma, n)
        mle_estimates.append(sample.mean())
    # Standardize
    standardized = np.sqrt(n) * (np.array(mle_estimates) - true_mu) / true_sigma
    axes[i].hist(standardized, bins=50, density=True, alpha=0.7)
    x = np.linspace(-4, 4, 100)
    axes[i].plot(x, stats.norm.pdf(x, 0, 1), 'r-', lw=2, label='N(0,1)')
    axes[i].set_xlabel('√n(μ̂ - μ)/σ')
    axes[i].set_ylabel('Density')
    axes[i].set_title(f'n = {n}')
    axes[i].legend()
    axes[i].set_xlim(-4, 4)
plt.suptitle('Asymptotic Normality of MLE', fontsize=14)
plt.tight_layout()
plt.show()
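For the normal mean with known σ, the Fisher information is I(μ) = 1/σ², so the asymptotic variance of the MLE is 1/(n·I(μ)) = σ²/n. A quick numerical check for one sample size (the simulation sizes are arbitrary):

# Var(μ̂) should match the Cramér-Rao bound σ²/n
n = 100
mles = [np.random.normal(true_mu, true_sigma, n).mean() for _ in range(5000)]
print(f"Simulated Var(μ̂): {np.var(mles):.4f}, theoretical σ²/n: {true_sigma**2 / n:.4f}")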
5. Method of Moments (MoM)
5.1 Method of Moments Concept
# Method of Moments: Estimate parameters by setting sample moments = population moments
# k-th sample moment: m_k = (1/n)Σx_i^k
# k-th central moment: m'_k = (1/n)Σ(x_i - x̄)^k
def sample_moments(data, k):
    """Calculate the k-th sample moment."""
    return np.mean(data ** k)

def sample_central_moments(data, k):
    """Calculate the k-th sample central moment."""
    return np.mean((data - data.mean()) ** k)
# Example data
np.random.seed(42)
data = np.random.gamma(shape=5, scale=2, size=1000)
print("Sample moments:")
for k in range(1, 5):
print(f" {k}-th moment: {sample_moments(data, k):.4f}")
print("\nSample central moments:")
for k in range(2, 5):
print(f" {k}-th central moment: {sample_central_moments(data, k):.4f}")
5.2 Method of Moments for Gamma Distribution
# Gamma distribution Gamma(α, β): E[X] = α/β, Var(X) = α/β²
# MoM: α̂ = x̄² / s², β̂ = x̄ / s²
def gamma_mom_estimator(data):
    """Method-of-moments estimator for the gamma distribution (rate parameterization)."""
    x_bar = data.mean()
    s_squared = data.var(ddof=1)
    alpha_hat = x_bar ** 2 / s_squared
    beta_hat = x_bar / s_squared
    return alpha_hat, beta_hat
# Generate data
np.random.seed(42)
true_alpha, true_beta = 5, 2
data = np.random.gamma(shape=true_alpha, scale=1/true_beta, size=500)
# Method of moments
alpha_mom, beta_mom = gamma_mom_estimator(data)
# MLE (scipy)
alpha_mle, loc_mle, scale_mle = stats.gamma.fit(data, floc=0)
beta_mle = 1 / scale_mle
print("Gamma distribution parameter estimation:")
print("-" * 50)
print(f"{'Method':<15} {'α':<12} {'β':<12}")
print("-" * 50)
print(f"{'True':<15} {true_alpha:<12} {true_beta:<12}")
print(f"{'MoM':<15} {alpha_mom:<12.4f} {beta_mom:<12.4f}")
print(f"{'MLE':<15} {alpha_mle:<12.4f} {beta_mle:<12.4f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(data, bins=50, density=True, alpha=0.7, label='Data')
x = np.linspace(0, data.max(), 100)
ax.plot(x, stats.gamma.pdf(x, a=true_alpha, scale=1/true_beta), 'g--', lw=2,
        label=f'True: α={true_alpha}, β={true_beta}')
ax.plot(x, stats.gamma.pdf(x, a=alpha_mom, scale=1/beta_mom), 'r-', lw=2,
        label=f'MoM: α={alpha_mom:.2f}, β={beta_mom:.2f}')
ax.plot(x, stats.gamma.pdf(x, a=alpha_mle, scale=1/beta_mle), 'b:', lw=2,
        label=f'MLE: α={alpha_mle:.2f}, β={beta_mle:.2f}')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Gamma Distribution Estimation: MoM vs MLE')
ax.legend()
plt.show()
5.3 Method of Moments for Beta Distribution
# Beta distribution Beta(α, β): E[X] = α/(α+β), Var(X) = αβ/((α+β)²(α+β+1))
# Derive MoM
def beta_mom_estimator(data):
    """Method-of-moments estimator for the beta distribution."""
    x_bar = data.mean()
    s_squared = data.var(ddof=1)
    # Solve the two moment equations for α and β
    temp = (x_bar * (1 - x_bar) / s_squared) - 1
    alpha_hat = x_bar * temp
    beta_hat = (1 - x_bar) * temp
    return alpha_hat, beta_hat
# Generate data
np.random.seed(42)
true_alpha, true_beta = 2, 5
data = np.random.beta(true_alpha, true_beta, size=500)
# Method of moments
alpha_mom, beta_mom = beta_mom_estimator(data)
# MLE (scipy)
alpha_mle, beta_mle, loc_mle, scale_mle = stats.beta.fit(data, floc=0, fscale=1)
print("Beta distribution parameter estimation:")
print("-" * 50)
print(f"{'Method':<15} {'α':<12} {'β':<12}")
print("-" * 50)
print(f"{'True':<15} {true_alpha:<12} {true_beta:<12}")
print(f"{'MoM':<15} {alpha_mom:<12.4f} {beta_mom:<12.4f}")
print(f"{'MLE':<15} {alpha_mle:<12.4f} {beta_mle:<12.4f}")
# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(data, bins=30, density=True, alpha=0.7, label='Data')
x = np.linspace(0.001, 0.999, 100)
ax.plot(x, stats.beta.pdf(x, true_alpha, true_beta), 'g--', lw=2,
        label=f'True: α={true_alpha}, β={true_beta}')
ax.plot(x, stats.beta.pdf(x, alpha_mom, beta_mom), 'r-', lw=2,
        label=f'MoM: α={alpha_mom:.2f}, β={beta_mom:.2f}')
ax.plot(x, stats.beta.pdf(x, alpha_mle, beta_mle), 'b:', lw=2,
        label=f'MLE: α={alpha_mle:.2f}, β={beta_mle:.2f}')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Beta Distribution Estimation: MoM vs MLE')
ax.legend()
plt.show()
5.4 MLE vs MoM Comparison
# Simulation comparing MLE vs MoM
np.random.seed(42)
true_alpha, true_beta = 3, 2
sample_sizes = [20, 50, 100, 500]
n_simulations = 1000
results = []
for n in sample_sizes:
    mle_alpha, mle_beta = [], []
    mom_alpha, mom_beta = [], []
    for _ in range(n_simulations):
        data = np.random.gamma(shape=true_alpha, scale=1/true_beta, size=n)
        # MoM
        a_mom, b_mom = gamma_mom_estimator(data)
        mom_alpha.append(a_mom)
        mom_beta.append(b_mom)
        # MLE (skip the rare samples where the fit fails to converge)
        try:
            a_mle, _, scale_mle = stats.gamma.fit(data, floc=0)
            b_mle = 1 / scale_mle
            mle_alpha.append(a_mle)
            mle_beta.append(b_mle)
        except Exception:
            pass
    results.append({
        'n': n,
        'MoM_alpha_bias': np.mean(mom_alpha) - true_alpha,
        'MoM_alpha_var': np.var(mom_alpha),
        'MLE_alpha_bias': np.mean(mle_alpha) - true_alpha,
        'MLE_alpha_var': np.var(mle_alpha),
    })
# Display results
import pandas as pd
df_results = pd.DataFrame(results)
print("MLE vs MoM comparison (Gamma distribution α estimation):")
print(df_results.to_string(index=False))
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Bias
axes[0].plot(df_results['n'], np.abs(df_results['MoM_alpha_bias']), 'r-o', label='MoM')
axes[0].plot(df_results['n'], np.abs(df_results['MLE_alpha_bias']), 'b-s', label='MLE')
axes[0].set_xlabel('Sample size (n)')
axes[0].set_ylabel('|Bias|')
axes[0].set_title('Bias Comparison')
axes[0].legend()
axes[0].set_xscale('log')
# Variance
axes[1].plot(df_results['n'], df_results['MoM_alpha_var'], 'r-o', label='MoM')
axes[1].plot(df_results['n'], df_results['MLE_alpha_var'], 'b-s', label='MLE')
axes[1].set_xlabel('Sample size (n)')
axes[1].set_ylabel('Variance')
axes[1].set_title('Variance Comparison')
axes[1].legend()
axes[1].set_xscale('log')
axes[1].set_yscale('log')
plt.tight_layout()
plt.show()
6. Practical Example: Distribution Fitting
6.1 Comparing Multiple Distribution Fits
# Fit multiple distributions to data and compare
np.random.seed(42)
# Real data (generated from gamma distribution)
true_data = np.random.gamma(shape=3, scale=2, size=500)
# Fit multiple distributions
distributions = {
    'Normal': stats.norm,
    'Gamma': stats.gamma,
    'Lognormal': stats.lognorm,
    'Exponential': stats.expon,
    'Weibull': stats.weibull_min,
}
fit_results = {}
for name, dist in distributions.items():
    try:
        params = dist.fit(true_data)
        # Calculate log-likelihood
        log_likelihood = np.sum(dist.logpdf(true_data, *params))
        # AIC = -2*logL + 2*k
        k = len(params)
        aic = -2 * log_likelihood + 2 * k
        fit_results[name] = {'params': params, 'logL': log_likelihood, 'AIC': aic}
    except Exception as e:
        fit_results[name] = {'error': str(e)}
# Display results
print("Distribution fitting results (sorted by AIC):")
print("-" * 60)
print(f"{'Distribution':<15} {'Log-Likelihood':<18} {'AIC':<12}")
print("-" * 60)
sorted_results = sorted(fit_results.items(), key=lambda x: x[1].get('AIC', np.inf))
for name, result in sorted_results:
    if 'AIC' in result:
        print(f"{name:<15} {result['logL']:<18.2f} {result['AIC']:<12.2f}")
# Visualization
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(true_data, bins=50, density=True, alpha=0.5, label='Data', color='gray')
x = np.linspace(0.01, true_data.max(), 200)
colors = plt.cm.Set1(np.linspace(0, 1, len(distributions)))
for (name, dist), color in zip(distributions.items(), colors):
    if name in fit_results and 'params' in fit_results[name]:
        params = fit_results[name]['params']
        ax.plot(x, dist.pdf(x, *params), linewidth=2, color=color,
                label=f'{name} (AIC={fit_results[name]["AIC"]:.1f})')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Comparison of Multiple Distribution Fits')
ax.legend()
ax.set_xlim(0, true_data.max())
plt.show()
print(f"\nBest fitting distribution: {sorted_results[0][0]}")
6.2 Goodness-of-Fit Tests
# Kolmogorov-Smirnov and Shapiro-Wilk tests (Anderson-Darling shown at the end)
# Goodness-of-fit testing against the normal distribution
np.random.seed(42)
data_normal = np.random.normal(50, 10, 200)
data_skewed = np.random.gamma(2, 5, 200)
def goodness_of_fit_tests(data, dist_name='norm'):
    """Perform goodness-of-fit tests against a fitted distribution."""
    # Fit the candidate distribution
    if dist_name == 'norm':
        params = stats.norm.fit(data)
        dist = stats.norm(*params)
    elif dist_name == 'expon':
        params = stats.expon.fit(data)
        dist = stats.expon(*params)
    # KS test against the fitted distribution
    ks_stat, ks_pvalue = stats.kstest(data, dist.cdf)
    # Shapiro-Wilk test (normality only)
    if dist_name == 'norm':
        sw_stat, sw_pvalue = stats.shapiro(data)
    else:
        sw_stat, sw_pvalue = np.nan, np.nan
    return {
        'KS_statistic': ks_stat,
        'KS_pvalue': ks_pvalue,
        'SW_statistic': sw_stat,
        'SW_pvalue': sw_pvalue,
    }
print("Goodness-of-fit test results:")
print("=" * 60)
print("\nNormal data:")
results_normal = goodness_of_fit_tests(data_normal, 'norm')
print(f" KS test: statistic={results_normal['KS_statistic']:.4f}, p={results_normal['KS_pvalue']:.4f}")
print(f" SW test: statistic={results_normal['SW_statistic']:.4f}, p={results_normal['SW_pvalue']:.4f}")
print(f" → Fits normal distribution (p > 0.05)")
print("\nSkewed data:")
results_skewed = goodness_of_fit_tests(data_skewed, 'norm')
print(f" KS test: statistic={results_skewed['KS_statistic']:.4f}, p={results_skewed['KS_pvalue']:.4f}")
print(f" SW test: statistic={results_skewed['SW_statistic']:.4f}, p={results_skewed['SW_pvalue']:.4f}")
print(f" → Does not fit normal distribution (p < 0.05)")
# Q-Q plots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
stats.probplot(data_normal, dist="norm", plot=axes[0])
axes[0].set_title('Q-Q Plot for Normal Data')
axes[0].grid(alpha=0.3)
stats.probplot(data_skewed, dist="norm", plot=axes[1])
axes[1].set_title('Q-Q Plot for Skewed Data')
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
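The Anderson-Darling test listed in the summary weights the tails more heavily than KS. scipy implements it for a handful of distributions via stats.anderson; compare the statistic with the tabulated critical values rather than a p-value (a short sketch):

# Anderson-Darling normality test; reject at 5% if A² exceeds the critical value
for label, d in [('Normal data', data_normal), ('Skewed data', data_skewed)]:
    ad = stats.anderson(d, dist='norm')
    print(f"{label}: A² = {ad.statistic:.3f}, "
          f"5% critical value = {ad.critical_values[2]:.3f}")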
Practice Problems
Problem 1: Sampling Distribution
For a normal population with mean 100 and standard deviation 15, a sample of size 36 is drawn:
- (a) What are the expected value and standard error of the sample mean?
- (b) What is the probability that the sample mean is between 95 and 105?
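One way to verify your hand calculation for part (b) numerically (a sketch):

# Check: X̄ ~ N(100, 15²/36), so SE = 15/√36 = 2.5
se = 15 / np.sqrt(36)
prob = stats.norm.cdf(105, loc=100, scale=se) - stats.norm.cdf(95, loc=100, scale=se)
print(f"P(95 < X̄ < 105) = {prob:.4f}")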
Problem 2: MLE
Assuming the following data follow a Poisson distribution, find the MLE of λ and estimate a 95% confidence interval.
data = [3, 5, 4, 2, 6, 3, 4, 5, 2, 4]
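One common approach (a sketch; the Wald interval λ̂ ± z·√(λ̂/n) is only an asymptotic approximation):

# Poisson MLE and an asymptotic 95% CI (Wald interval)
data = np.array([3, 5, 4, 2, 6, 3, 4, 5, 2, 4])
lam_hat = data.mean()  # Poisson MLE: λ̂ = x̄
se = np.sqrt(lam_hat / len(data))
print(f"λ̂ = {lam_hat:.2f}, 95% CI ≈ ({lam_hat - 1.96*se:.2f}, {lam_hat + 1.96*se:.2f})")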
Problem 3: Method of Moments
Derive the method of moments estimators for a and b in the uniform distribution U(a, b). (Hint: E[X] = (a+b)/2, Var(X) = (b-a)²/12)
Summary
| Concept | Key Content | Python Function |
|---|---|---|
| Sample mean distribution | X̄ ~ N(μ, σ²/n) | Simulation |
| Standard error | SE = σ/√n | s / np.sqrt(n) |
| Unbiasedness | E[θ̂] = θ | Use ddof=1 |
| Consistency | θ̂ → θ as n → ∞ | Simulation |
| MLE | Maximize L(θ) | scipy.stats.&lt;dist&gt;.fit() |
| Method of moments | Sample moments = Population moments | Derive formula |
| Goodness-of-fit | KS, Anderson-Darling | stats.kstest() |