# 04_regression_analysis.py
  1"""
  204_regression_analysis.py
  3
  4Demonstrates regression analysis techniques:
  5- OLS regression from scratch
  6- Multiple regression
  7- Polynomial regression
  8- Residual analysis
  9- R-squared and adjusted R-squared
 10- Confidence and prediction intervals
 11"""
 12
 13import numpy as np
 14from scipy import stats
 15
 16try:
 17    import matplotlib.pyplot as plt
 18    HAS_PLT = True
 19except ImportError:
 20    HAS_PLT = False
 21    print("matplotlib not available; skipping plots\n")
 22
 23
 24def print_section(title):
 25    """Print formatted section header."""
 26    print("\n" + "=" * 70)
 27    print(f"  {title}")
 28    print("=" * 70)
 29
 30
 31def simple_ols_from_scratch():
 32    """Demonstrate OLS regression from scratch."""
 33    print_section("1. Simple OLS Regression (From Scratch)")
 34
 35    # Generate data
 36    np.random.seed(42)
 37    n = 100
 38    x = np.random.uniform(0, 10, n)
 39    true_slope = 2.5
 40    true_intercept = 5.0
 41    noise = np.random.normal(0, 2, n)
 42    y = true_intercept + true_slope * x + noise
 43
 44    print(f"Generated data: n = {n}")
 45    print(f"True model: y = {true_intercept} + {true_slope}*x + ε")
 46
 47    # Calculate OLS estimates from scratch
 48    x_mean = np.mean(x)
 49    y_mean = np.mean(y)
 50
 51    # Slope: β₁ = Cov(x,y) / Var(x)
 52    numerator = np.sum((x - x_mean) * (y - y_mean))
 53    denominator = np.sum((x - x_mean)**2)
 54    beta_1 = numerator / denominator
 55
 56    # Intercept: β₀ = ȳ - β₁*x̄
 57    beta_0 = y_mean - beta_1 * x_mean
 58
 59    print(f"\nOLS estimates:")
 60    print(f"  β₀ (intercept): {beta_0:.4f}")
 61    print(f"  β₁ (slope): {beta_1:.4f}")
 62
 63    # Predictions and residuals
 64    y_pred = beta_0 + beta_1 * x
 65    residuals = y - y_pred
 66
 67    # Sum of squares
 68    ss_total = np.sum((y - y_mean)**2)
 69    ss_residual = np.sum(residuals**2)
 70    ss_regression = np.sum((y_pred - y_mean)**2)
 71
 72    print(f"\nSum of squares:")
 73    print(f"  SS_total: {ss_total:.2f}")
 74    print(f"  SS_regression: {ss_regression:.2f}")
 75    print(f"  SS_residual: {ss_residual:.2f}")
 76
 77    # R-squared
 78    r_squared = 1 - (ss_residual / ss_total)
 79    print(f"\nR²: {r_squared:.4f}")
 80
 81    # Standard error of regression
 82    dof = n - 2
 83    se_regression = np.sqrt(ss_residual / dof)
 84    print(f"Standard error of regression: {se_regression:.4f}")
 85
 86    # Standard errors of coefficients
 87    se_beta_1 = se_regression / np.sqrt(denominator)
 88    se_beta_0 = se_regression * np.sqrt(1/n + x_mean**2/denominator)
 89
 90    print(f"\nStandard errors:")
 91    print(f"  SE(β₀): {se_beta_0:.4f}")
 92    print(f"  SE(β₁): {se_beta_1:.4f}")
 93
 94    # t-statistics and p-values
 95    t_beta_0 = beta_0 / se_beta_0
 96    t_beta_1 = beta_1 / se_beta_1
 97    p_beta_0 = 2 * (1 - stats.t.cdf(abs(t_beta_0), dof))
 98    p_beta_1 = 2 * (1 - stats.t.cdf(abs(t_beta_1), dof))
 99
100    print(f"\nt-statistics:")
101    print(f"  t(β₀): {t_beta_0:.4f} (p={p_beta_0:.6f})")
102    print(f"  t(β₁): {t_beta_1:.4f} (p={p_beta_1:.6f})")
103
104    if HAS_PLT:
105        plt.figure(figsize=(10, 5))
106        plt.scatter(x, y, alpha=0.6, label='Data')
107        plt.plot(x, y_pred, 'r-', linewidth=2, label=f'Fit: y={beta_0:.2f}+{beta_1:.2f}x')
108        plt.xlabel('x')
109        plt.ylabel('y')
110        plt.title('Simple Linear Regression')
111        plt.legend()
112        plt.grid(True, alpha=0.3)
113        plt.savefig('/tmp/simple_regression.png', dpi=100)
114        print("\n[Plot saved to /tmp/simple_regression.png]")
115        plt.close()
116
117
def multiple_regression():
    """Demonstrate multiple regression.

    Builds a three-predictor linear model, solves the normal equations
    for the OLS coefficients, and reports fit statistics: R², adjusted
    R², per-coefficient standard errors, and the overall F-test.
    """
    print_section("2. Multiple Regression")

    # Generate data with multiple predictors
    np.random.seed(123)
    n = 150
    x1 = np.random.uniform(0, 10, n)
    x2 = np.random.uniform(0, 5, n)
    x3 = np.random.uniform(-2, 2, n)

    # True model: y = 3 + 2*x1 - 1.5*x2 + 0.8*x3 + noise
    true_coeffs = np.array([3.0, 2.0, -1.5, 0.8])
    noise = np.random.normal(0, 1.5, n)
    y = true_coeffs[0] + true_coeffs[1]*x1 + true_coeffs[2]*x2 + true_coeffs[3]*x3 + noise

    print(f"Generated data: n = {n}")
    print(f"True coefficients: {true_coeffs}")

    # Design matrix with a leading intercept column of ones.
    X = np.column_stack([np.ones(n), x1, x2, x3])

    # OLS via the normal equations: β = (X'X)⁻¹X'y, solved with
    # np.linalg.solve rather than forming the inverse.
    XtX = X.T @ X
    Xty = X.T @ y
    beta = np.linalg.solve(XtX, Xty)

    print(f"\nEstimated coefficients:")
    print(f"  β₀ (intercept): {beta[0]:.4f}")
    print(f"  β₁ (x1): {beta[1]:.4f}")
    print(f"  β₂ (x2): {beta[2]:.4f}")
    print(f"  β₃ (x3): {beta[3]:.4f}")

    # Predictions and residuals
    y_pred = X @ beta
    residuals = y - y_pred

    # R-squared
    ss_total = np.sum((y - np.mean(y))**2)
    ss_residual = np.sum(residuals**2)
    r_squared = 1 - (ss_residual / ss_total)

    # Adjusted R-squared penalizes for the number of predictors.
    k = X.shape[1] - 1  # number of predictors (excluding intercept)
    adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - k - 1)

    print(f"\nModel fit:")
    print(f"  R²: {r_squared:.4f}")
    print(f"  Adjusted R²: {adj_r_squared:.4f}")

    # Coefficient covariance matrix: MSE * (X'X)⁻¹.
    dof = n - X.shape[1]
    mse = ss_residual / dof
    cov_matrix = mse * np.linalg.inv(XtX)
    se = np.sqrt(np.diag(cov_matrix))

    # BUG FIX: the original loop printed the literal label "SE(β₁)" for
    # every coefficient; label each standard error with its own index.
    subscripts = "₀₁₂₃₄₅₆₇₈₉"
    print(f"\nStandard errors:")
    for i, s in enumerate(se):
        print(f"  SE(β{subscripts[i]}): {s:.4f}")

    # F-statistic for the overall model (H0: all slope coefficients are 0).
    # stats.f.sf computes the upper tail directly, avoiding the precision
    # loss of 1 - cdf for very small p-values.
    ms_regression = (ss_total - ss_residual) / k
    ms_residual = ss_residual / dof
    f_statistic = ms_regression / ms_residual
    p_value_f = stats.f.sf(f_statistic, k, dof)

    print(f"\nOverall model test:")
    print(f"  F-statistic: {f_statistic:.4f}")
    print(f"  p-value: {p_value_f:.6f}")

def polynomial_regression():
    """Demonstrate polynomial regression.

    Fits polynomials of degree 1-4 to noisy cubic data, compares them
    with R², adjusted R², AIC, and BIC, and selects the best degree by
    minimum AIC. Plots the four fits when matplotlib is available.
    """
    print_section("3. Polynomial Regression")

    # Cubic ground truth with additive Gaussian noise.
    np.random.seed(456)
    n = 80
    x = np.random.uniform(-3, 3, n)
    y_true = 2 - 0.5*x + 1.5*x**2 - 0.3*x**3
    y = y_true + np.random.normal(0, 1, n)

    print(f"Generated non-linear data: n = {n}")

    print(f"\nFitting polynomials of degree 1-4:")

    fits = []
    for deg in (1, 2, 3, 4):
        # Polynomial design matrix: columns x^0 .. x^deg.
        design = np.column_stack([x**p for p in range(deg + 1)])
        coef = np.linalg.lstsq(design, y, rcond=None)[0]
        fitted = design @ coef

        # Goodness-of-fit metrics.
        resid = y - fitted
        ss_total = np.sum((y - np.mean(y))**2)
        ss_residual = np.sum(resid**2)
        r_squared = 1 - (ss_residual / ss_total)
        adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - deg - 1)

        # Gaussian log-likelihood at the MLE variance, for AIC/BIC.
        log_likelihood = -n/2 * np.log(2*np.pi) - n/2 * np.log(ss_residual/n) - n/2
        aic = 2 * (deg + 1) - 2 * log_likelihood
        bic = (deg + 1) * np.log(n) - 2 * log_likelihood

        fits.append({
            'degree': deg,
            'r2': r_squared,
            'adj_r2': adj_r_squared,
            'aic': aic,
            'bic': bic,
            'beta': coef,
        })

        print(f"\n  Degree {deg}:")
        print(f"    R²: {r_squared:.4f}")
        print(f"    Adjusted R²: {adj_r_squared:.4f}")
        print(f"    AIC: {aic:.2f}")
        print(f"    BIC: {bic:.2f}")

    # Lowest AIC wins (min, like argmin, keeps the first entry on ties).
    best = min(fits, key=lambda f: f['aic'])
    print(f"\nBest model by AIC: degree {best['degree']}")

    if HAS_PLT:
        grid = np.linspace(-3, 3, 200)

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        for panel, fit in zip(axes.ravel(), fits):
            deg = fit['degree']
            curve = np.column_stack([grid**p for p in range(deg + 1)]) @ fit['beta']

            panel.scatter(x, y, alpha=0.5, s=20)
            panel.plot(grid, curve, 'r-', linewidth=2)
            panel.set_xlabel('x')
            panel.set_ylabel('y')
            panel.set_title(f'Degree {deg} (R²={fit["r2"]:.3f})')
            panel.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/polynomial_regression.png', dpi=100)
        print("\n[Plot saved to /tmp/polynomial_regression.png]")
        plt.close()

def residual_analysis():
    """Demonstrate residual analysis.

    Fits a line to deliberately heteroscedastic data, then examines the
    residuals: summary statistics, Shapiro-Wilk normality test,
    Durbin-Watson autocorrelation statistic, and four diagnostic plots.
    """
    print_section("4. Residual Analysis")

    # Noise scale grows with x, so the data are heteroscedastic.
    np.random.seed(789)
    n = 100
    x = np.random.uniform(1, 10, n)
    y = 5 + 2*x + np.random.normal(0, 0.5*x, n)  # Variance increases with x

    # Least-squares fit of y = b0 + b1*x.
    design = np.column_stack([np.ones(n), x])
    coef = np.linalg.lstsq(design, y, rcond=None)[0]
    fitted = design @ coef
    resid = y - fitted

    print(f"Fitted model: y = {coef[0]:.2f} + {coef[1]:.2f}*x")

    # Standardize by the residual std (ddof=2 for the two fitted params).
    resid_std = np.std(resid, ddof=2)
    standardized = resid / resid_std

    print(f"\nResidual statistics:")
    print(f"  Mean: {np.mean(resid):.6f}")
    print(f"  Std: {resid_std:.4f}")
    print(f"  Min: {np.min(resid):.4f}")
    print(f"  Max: {np.max(resid):.4f}")

    # Shapiro-Wilk: H0 is that the residuals are normally distributed.
    stat, p_value = stats.shapiro(resid)
    print(f"\nShapiro-Wilk normality test:")
    print(f"  Statistic: {stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    verdict = ("Residuals appear normally distributed" if p_value > 0.05
               else "Residuals may not be normally distributed")
    print(f"  {verdict}")

    # Durbin-Watson statistic (≈2 means no lag-1 autocorrelation).
    dw = np.sum(np.diff(resid)**2) / np.sum(resid**2)
    print(f"\nDurbin-Watson statistic: {dw:.4f}")
    print(f"  (Values near 2 indicate no autocorrelation)")

    if HAS_PLT:
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        (ax_fit, ax_qq), (ax_hist, ax_order) = axes

        # Residuals vs fitted values (a fan shape reveals heteroscedasticity).
        ax_fit.scatter(fitted, resid, alpha=0.6)
        ax_fit.axhline(0, color='red', linestyle='--')
        ax_fit.set_xlabel('Fitted values')
        ax_fit.set_ylabel('Residuals')
        ax_fit.set_title('Residuals vs Fitted')
        ax_fit.grid(True, alpha=0.3)

        # Q-Q plot against the normal distribution.
        stats.probplot(resid, dist="norm", plot=ax_qq)
        ax_qq.set_title('Normal Q-Q Plot')
        ax_qq.grid(True, alpha=0.3)

        # Histogram of residuals.
        ax_hist.hist(resid, bins=20, edgecolor='black', alpha=0.7)
        ax_hist.set_xlabel('Residuals')
        ax_hist.set_ylabel('Frequency')
        ax_hist.set_title('Histogram of Residuals')
        ax_hist.grid(True, alpha=0.3)

        # Residuals in observation order (checks for serial patterns).
        ax_order.scatter(range(len(resid)), resid, alpha=0.6)
        ax_order.axhline(0, color='red', linestyle='--')
        ax_order.set_xlabel('Observation order')
        ax_order.set_ylabel('Residuals')
        ax_order.set_title('Residuals vs Order')
        ax_order.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/residual_analysis.png', dpi=100)
        print("\n[Plot saved to /tmp/residual_analysis.png]")
        plt.close()

def confidence_prediction_intervals():
    """Demonstrate confidence and prediction intervals.

    Fits a simple linear model, then computes the 95% confidence band
    for the mean response and the wider 95% prediction band for new
    observations over a grid of x values.
    """
    print_section("5. Confidence and Prediction Intervals")

    # Simulated linear data.
    np.random.seed(999)
    n = 50
    x = np.random.uniform(0, 10, n)
    y = 5 + 2*x + np.random.normal(0, 2, n)

    # OLS fit via least squares.
    design = np.column_stack([np.ones(n), x])
    coef = np.linalg.lstsq(design, y, rcond=None)[0]
    fitted = design @ coef
    resid = y - fitted

    # Residual mean square and (X'X)⁻¹ drive both interval widths.
    dof = n - 2
    mse = np.sum(resid**2) / dof
    XtX_inv = np.linalg.inv(design.T @ design)

    # Evaluation grid for the bands.
    grid = np.linspace(0, 10, 100)
    grid_design = np.column_stack([np.ones(len(grid)), grid])
    grid_fit = grid_design @ coef

    # Leverage term h(x0) = x0' (X'X)⁻¹ x0 at each grid point; shared by
    # both bands.
    leverage = np.sum((grid_design @ XtX_inv) * grid_design, axis=1)
    t_crit = stats.t.ppf(0.975, dof)

    # Confidence band for the mean response: fit ± t * sqrt(MSE * h).
    conf_se = np.sqrt(mse * leverage)
    conf_lower = grid_fit - t_crit * conf_se
    conf_upper = grid_fit + t_crit * conf_se

    # Prediction band adds the individual-observation variance (the +1).
    pred_se = np.sqrt(mse * (1 + leverage))
    pred_lower = grid_fit - t_crit * pred_se
    pred_upper = grid_fit + t_crit * pred_se

    print(f"Fitted model: y = {coef[0]:.2f} + {coef[1]:.2f}*x")
    print(f"\nInterval widths at x=5:")

    # Grid index closest to x = 5.
    idx = np.argmin(np.abs(grid - 5))

    print(f"  Confidence interval width: {conf_upper[idx] - conf_lower[idx]:.4f}")
    print(f"  Prediction interval width: {pred_upper[idx] - pred_lower[idx]:.4f}")
    print(f"\nPrediction intervals are wider (account for individual variation)")

    if HAS_PLT:
        plt.figure(figsize=(10, 6))
        plt.scatter(x, y, alpha=0.6, label='Data')
        plt.plot(grid, grid_fit, 'r-', linewidth=2, label='Fit')
        plt.fill_between(grid, conf_lower, conf_upper, alpha=0.2,
                        color='red', label='95% Confidence Interval')
        plt.fill_between(grid, pred_lower, pred_upper, alpha=0.1,
                        color='blue', label='95% Prediction Interval')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('Confidence vs Prediction Intervals')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/intervals.png', dpi=100)
        print("\n[Plot saved to /tmp/intervals.png]")
        plt.close()

def main():
    """Run all demonstrations in order."""
    banner = "=" * 70
    print(banner)
    print("  REGRESSION ANALYSIS DEMONSTRATIONS")
    print(banner)

    np.random.seed(42)

    # Each demo seeds its own RNG and is independent of the others.
    demos = (
        simple_ols_from_scratch,
        multiple_regression,
        polynomial_regression,
        residual_analysis,
        confidence_prediction_intervals,
    )
    for demo in demos:
        demo()

    print("\n" + banner)
    print("  All demonstrations completed successfully!")
    print(banner)


if __name__ == "__main__":
    main()