1"""
204_regression_analysis.py
3
4Demonstrates regression analysis techniques:
5- OLS regression from scratch
6- Multiple regression
7- Polynomial regression
8- Residual analysis
9- R-squared and adjusted R-squared
10- Confidence and prediction intervals
11"""
12
13import numpy as np
14from scipy import stats
15
16try:
17 import matplotlib.pyplot as plt
18 HAS_PLT = True
19except ImportError:
20 HAS_PLT = False
21 print("matplotlib not available; skipping plots\n")
22
23
def print_section(title):
    """Display a section banner: a rule line, the title, another rule line."""
    bar = "=" * 70
    print(f"\n{bar}\n {title}\n{bar}")
29
30
def simple_ols_from_scratch():
    """Fit simple linear regression by hand and report inference statistics.

    Simulates y = 5 + 2.5*x + N(0, 2), estimates slope and intercept with
    the closed-form OLS formulas, then prints the sum-of-squares
    decomposition, R², coefficient standard errors, t-statistics with
    two-sided p-values, and optionally saves a scatter/fit plot.
    """
    print_section("1. Simple OLS Regression (From Scratch)")

    # Generate data
    np.random.seed(42)
    n = 100
    x = np.random.uniform(0, 10, n)
    true_slope = 2.5
    true_intercept = 5.0
    noise = np.random.normal(0, 2, n)
    y = true_intercept + true_slope * x + noise

    print(f"Generated data: n = {n}")
    print(f"True model: y = {true_intercept} + {true_slope}*x + ε")

    # Calculate OLS estimates from scratch
    x_mean = np.mean(x)
    y_mean = np.mean(y)

    # Slope: β₁ = Cov(x,y) / Var(x)
    numerator = np.sum((x - x_mean) * (y - y_mean))
    denominator = np.sum((x - x_mean)**2)
    beta_1 = numerator / denominator

    # Intercept: β₀ = ȳ - β₁*x̄
    beta_0 = y_mean - beta_1 * x_mean

    print(f"\nOLS estimates:")
    print(f" β₀ (intercept): {beta_0:.4f}")
    print(f" β₁ (slope): {beta_1:.4f}")

    # Predictions and residuals
    y_pred = beta_0 + beta_1 * x
    residuals = y - y_pred

    # Sum-of-squares decomposition: SS_total = SS_regression + SS_residual
    ss_total = np.sum((y - y_mean)**2)
    ss_residual = np.sum(residuals**2)
    ss_regression = np.sum((y_pred - y_mean)**2)

    print(f"\nSum of squares:")
    print(f" SS_total: {ss_total:.2f}")
    print(f" SS_regression: {ss_regression:.2f}")
    print(f" SS_residual: {ss_residual:.2f}")

    # R-squared: fraction of variance explained by the fit
    r_squared = 1 - (ss_residual / ss_total)
    print(f"\nR²: {r_squared:.4f}")

    # Standard error of regression (two estimated parameters -> n-2 dof)
    dof = n - 2
    se_regression = np.sqrt(ss_residual / dof)
    print(f"Standard error of regression: {se_regression:.4f}")

    # Standard errors of coefficients
    se_beta_1 = se_regression / np.sqrt(denominator)
    se_beta_0 = se_regression * np.sqrt(1/n + x_mean**2/denominator)

    print(f"\nStandard errors:")
    print(f" SE(β₀): {se_beta_0:.4f}")
    print(f" SE(β₁): {se_beta_1:.4f}")

    # t-statistics and two-sided p-values.
    # FIX: use the survival function t.sf(|t|, dof) instead of
    # 1 - t.cdf(|t|, dof); sf is computed directly, so very small p-values
    # keep precision instead of cancelling to exactly 0.
    t_beta_0 = beta_0 / se_beta_0
    t_beta_1 = beta_1 / se_beta_1
    p_beta_0 = 2 * stats.t.sf(abs(t_beta_0), dof)
    p_beta_1 = 2 * stats.t.sf(abs(t_beta_1), dof)

    print(f"\nt-statistics:")
    print(f" t(β₀): {t_beta_0:.4f} (p={p_beta_0:.6f})")
    print(f" t(β₁): {t_beta_1:.4f} (p={p_beta_1:.6f})")

    if HAS_PLT:
        # FIX: draw the fitted line over x sorted left-to-right; for a
        # straight line the result looks the same, but it is the correct
        # general pattern for plotting any fitted curve.
        order = np.argsort(x)
        plt.figure(figsize=(10, 5))
        plt.scatter(x, y, alpha=0.6, label='Data')
        plt.plot(x[order], y_pred[order], 'r-', linewidth=2, label=f'Fit: y={beta_0:.2f}+{beta_1:.2f}x')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('Simple Linear Regression')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/simple_regression.png', dpi=100)
        print("\n[Plot saved to /tmp/simple_regression.png]")
        plt.close()
116
117
def multiple_regression():
    """Fit a three-predictor linear model via the normal equations.

    Simulates y = 3 + 2*x1 - 1.5*x2 + 0.8*x3 + N(0, 1.5), solves
    β = (X'X)⁻¹X'y, and reports coefficients, R² / adjusted R²,
    coefficient standard errors, and the overall F-test.
    """
    print_section("2. Multiple Regression")

    # Generate data with multiple predictors
    np.random.seed(123)
    n = 150
    x1 = np.random.uniform(0, 10, n)
    x2 = np.random.uniform(0, 5, n)
    x3 = np.random.uniform(-2, 2, n)

    # True model: y = 3 + 2*x1 - 1.5*x2 + 0.8*x3 + noise
    true_coeffs = np.array([3.0, 2.0, -1.5, 0.8])
    noise = np.random.normal(0, 1.5, n)
    y = true_coeffs[0] + true_coeffs[1]*x1 + true_coeffs[2]*x2 + true_coeffs[3]*x3 + noise

    print(f"Generated data: n = {n}")
    print(f"True coefficients: {true_coeffs}")

    # Design matrix with an intercept column
    X = np.column_stack([np.ones(n), x1, x2, x3])

    # OLS solution: β = (X'X)⁻¹X'y — solve the linear system rather than
    # inverting X'X for numerical stability
    XtX = X.T @ X
    Xty = X.T @ y
    beta = np.linalg.solve(XtX, Xty)

    print(f"\nEstimated coefficients:")
    print(f" β₀ (intercept): {beta[0]:.4f}")
    print(f" β₁ (x1): {beta[1]:.4f}")
    print(f" β₂ (x2): {beta[2]:.4f}")
    print(f" β₃ (x3): {beta[3]:.4f}")

    # Predictions and residuals
    y_pred = X @ beta
    residuals = y - y_pred

    # R-squared
    ss_total = np.sum((y - np.mean(y))**2)
    ss_residual = np.sum(residuals**2)
    r_squared = 1 - (ss_residual / ss_total)

    # Adjusted R-squared penalizes additional predictors
    k = X.shape[1] - 1  # number of predictors (excluding intercept)
    adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - k - 1)

    print(f"\nModel fit:")
    print(f" R²: {r_squared:.4f}")
    print(f" Adjusted R²: {adj_r_squared:.4f}")

    # Standard errors from the coefficient covariance matrix mse*(X'X)⁻¹
    dof = n - X.shape[1]
    mse = ss_residual / dof
    cov_matrix = mse * np.linalg.inv(XtX)
    se = np.sqrt(np.diag(cov_matrix))

    # BUG FIX: the loop previously printed the label "SE(β₁)" for every
    # coefficient; index the subscript so each row is labeled correctly.
    print(f"\nStandard errors:")
    for i, s in enumerate(se):
        print(f" SE(β{'₀₁₂₃'[i]}): {s:.4f}")

    # F-statistic for overall model significance
    ms_regression = (ss_total - ss_residual) / k
    ms_residual = ss_residual / dof
    f_statistic = ms_regression / ms_residual
    p_value_f = 1 - stats.f.cdf(f_statistic, k, dof)

    print(f"\nOverall model test:")
    print(f" F-statistic: {f_statistic:.4f}")
    print(f" p-value: {p_value_f:.6f}")
187
188
def polynomial_regression():
    """Compare polynomial fits of degree 1-4 on simulated cubic data.

    Fits each degree by least squares, reports R², adjusted R², AIC and
    BIC, selects the AIC-best degree, and optionally saves a 2x2 panel of
    the fitted curves.
    """
    print_section("3. Polynomial Regression")

    # Simulate noisy observations of a cubic polynomial
    np.random.seed(456)
    n = 80
    x = np.random.uniform(-3, 3, n)
    y_true = 2 - 0.5*x + 1.5*x**2 - 0.3*x**3
    y = y_true + np.random.normal(0, 1, n)

    print(f"Generated non-linear data: n = {n}")

    degrees = [1, 2, 3, 4]

    print(f"\nFitting polynomials of degree 1-4:")

    results = []
    for degree in degrees:
        # Vandermonde design matrix: columns x^0, x^1, ..., x^degree
        design = np.vander(x, degree + 1, increasing=True)

        # Least-squares coefficients and in-sample predictions
        coefs = np.linalg.lstsq(design, y, rcond=None)[0]
        fitted = design @ coefs

        # Goodness-of-fit metrics
        sse = np.sum((y - fitted)**2)
        sst = np.sum((y - np.mean(y))**2)
        r_squared = 1 - (sse / sst)

        k = degree
        adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - k - 1)

        # Gaussian log-likelihood at the MLE variance, then AIC / BIC
        log_likelihood = -n/2 * np.log(2*np.pi) - n/2 * np.log(sse/n) - n/2
        aic = 2 * (degree + 1) - 2 * log_likelihood
        bic = (degree + 1) * np.log(n) - 2 * log_likelihood

        results.append({
            'degree': degree,
            'r2': r_squared,
            'adj_r2': adj_r_squared,
            'aic': aic,
            'bic': bic,
            'beta': coefs
        })

        print(f"\n Degree {degree}:")
        print(f" R²: {r_squared:.4f}")
        print(f" Adjusted R²: {adj_r_squared:.4f}")
        print(f" AIC: {aic:.2f}")
        print(f" BIC: {bic:.2f}")

    # The lowest AIC wins
    best = min(results, key=lambda r: r['aic'])
    print(f"\nBest model by AIC: degree {best['degree']}")

    if HAS_PLT:
        x_plot = np.linspace(-3, 3, 200)

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # One panel per fitted degree
        for ax, result in zip(axes.ravel(), results):
            deg = result['degree']
            curve = np.vander(x_plot, deg + 1, increasing=True) @ result['beta']

            ax.scatter(x, y, alpha=0.5, s=20)
            ax.plot(x_plot, curve, 'r-', linewidth=2)
            ax.set_xlabel('x')
            ax.set_ylabel('y')
            ax.set_title(f'Degree {deg} (R²={result["r2"]:.3f})')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/polynomial_regression.png', dpi=100)
        print("\n[Plot saved to /tmp/polynomial_regression.png]")
        plt.close()
273
274
def residual_analysis():
    """Run diagnostic checks on the residuals of a linear fit.

    Deliberately simulates heteroscedastic noise (sd = 0.5*x), fits a
    straight line, then reports residual summary statistics, a
    Shapiro-Wilk normality test, the Durbin-Watson statistic, and
    optionally saves a 2x2 panel of diagnostic plots.
    """
    print_section("4. Residual Analysis")

    # Generate data with heteroscedasticity
    np.random.seed(789)
    n = 100
    x = np.random.uniform(1, 10, n)
    y = 5 + 2*x + np.random.normal(0, 0.5*x, n)  # Variance increases with x

    # Fit model
    X = np.column_stack([np.ones(n), x])
    beta = np.linalg.lstsq(X, y, rcond=None)[0]
    y_pred = X @ beta
    residuals = y - y_pred

    print(f"Fitted model: y = {beta[0]:.2f} + {beta[1]:.2f}*x")

    # Residual standard deviation with ddof=2 (two estimated parameters).
    # FIX: removed the `standardized_residuals` local that was computed
    # here but never used anywhere in the function.
    residual_std = np.std(residuals, ddof=2)

    print(f"\nResidual statistics:")
    print(f" Mean: {np.mean(residuals):.6f}")
    print(f" Std: {residual_std:.4f}")
    print(f" Min: {np.min(residuals):.4f}")
    print(f" Max: {np.max(residuals):.4f}")

    # Check normality (Shapiro-Wilk test)
    stat, p_value = stats.shapiro(residuals)
    print(f"\nShapiro-Wilk normality test:")
    print(f" Statistic: {stat:.4f}")
    print(f" p-value: {p_value:.4f}")
    if p_value > 0.05:
        print(f" Residuals appear normally distributed")
    else:
        print(f" Residuals may not be normally distributed")

    # Durbin-Watson statistic: values near 2 mean no lag-1 autocorrelation
    dw = np.sum(np.diff(residuals)**2) / np.sum(residuals**2)
    print(f"\nDurbin-Watson statistic: {dw:.4f}")
    print(f" (Values near 2 indicate no autocorrelation)")

    if HAS_PLT:
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # Residuals vs fitted: a funnel shape reveals heteroscedasticity
        axes[0, 0].scatter(y_pred, residuals, alpha=0.6)
        axes[0, 0].axhline(0, color='red', linestyle='--')
        axes[0, 0].set_xlabel('Fitted values')
        axes[0, 0].set_ylabel('Residuals')
        axes[0, 0].set_title('Residuals vs Fitted')
        axes[0, 0].grid(True, alpha=0.3)

        # Q-Q plot against the normal distribution
        stats.probplot(residuals, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Normal Q-Q Plot')
        axes[0, 1].grid(True, alpha=0.3)

        # Histogram of residuals
        axes[1, 0].hist(residuals, bins=20, edgecolor='black', alpha=0.7)
        axes[1, 0].set_xlabel('Residuals')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Histogram of Residuals')
        axes[1, 0].grid(True, alpha=0.3)

        # Residuals in observation order: trends suggest autocorrelation
        axes[1, 1].scatter(range(len(residuals)), residuals, alpha=0.6)
        axes[1, 1].axhline(0, color='red', linestyle='--')
        axes[1, 1].set_xlabel('Observation order')
        axes[1, 1].set_ylabel('Residuals')
        axes[1, 1].set_title('Residuals vs Order')
        axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/residual_analysis.png', dpi=100)
        print("\n[Plot saved to /tmp/residual_analysis.png]")
        plt.close()
353
354
def confidence_prediction_intervals():
    """Contrast 95% confidence vs prediction intervals for a linear fit.

    The confidence band bounds the mean response E[y|x]; the prediction
    band bounds a single new observation and is wider because it adds the
    residual variance. Prints both interval widths at x=5 and optionally
    saves a plot of the two bands.
    """
    print_section("5. Confidence and Prediction Intervals")

    # Simulate y = 5 + 2x + N(0, 2)
    np.random.seed(999)
    n = 50
    x = np.random.uniform(0, 10, n)
    y = 5 + 2*x + np.random.normal(0, 2, n)

    # Least-squares fit
    X = np.column_stack([np.ones(n), x])
    beta = np.linalg.lstsq(X, y, rcond=None)[0]
    resid = y - X @ beta

    # Residual variance and (X'X)⁻¹ for the leverage terms
    dof = n - 2
    mse = np.sum(resid**2) / dof
    XtX_inv = np.linalg.inv(X.T @ X)

    # Evaluation grid over the observed x-range
    x_new = np.linspace(0, 10, 100)
    X_new = np.column_stack([np.ones(len(x_new)), x_new])
    y_new = X_new @ beta

    # Leverage h(x₀) = x₀'(X'X)⁻¹x₀ at every grid point, shared by both bands
    leverage = np.sum((X_new @ XtX_inv) * X_new, axis=1)
    t_crit = stats.t.ppf(0.975, dof)

    # Mean-response (confidence) band: se = sqrt(mse * h)
    conf_se = np.sqrt(mse * leverage)
    conf_lower = y_new - t_crit * conf_se
    conf_upper = y_new + t_crit * conf_se

    # New-observation (prediction) band: se = sqrt(mse * (1 + h))
    pred_se = np.sqrt(mse * (1 + leverage))
    pred_lower = y_new - t_crit * pred_se
    pred_upper = y_new + t_crit * pred_se

    print(f"Fitted model: y = {beta[0]:.2f} + {beta[1]:.2f}*x")
    print(f"\nInterval widths at x=5:")

    # Grid point nearest x = 5
    idx = np.argmin(np.abs(x_new - 5))

    print(f" Confidence interval width: {conf_upper[idx] - conf_lower[idx]:.4f}")
    print(f" Prediction interval width: {pred_upper[idx] - pred_lower[idx]:.4f}")
    print(f"\nPrediction intervals are wider (account for individual variation)")

    if HAS_PLT:
        plt.figure(figsize=(10, 6))
        plt.scatter(x, y, alpha=0.6, label='Data')
        plt.plot(x_new, y_new, 'r-', linewidth=2, label='Fit')
        plt.fill_between(x_new, conf_lower, conf_upper, alpha=0.2,
                         color='red', label='95% Confidence Interval')
        plt.fill_between(x_new, pred_lower, pred_upper, alpha=0.1,
                         color='blue', label='95% Prediction Interval')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('Confidence vs Prediction Intervals')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/intervals.png', dpi=100)
        print("\n[Plot saved to /tmp/intervals.png]")
        plt.close()
418
419
def main():
    """Run every regression demonstration in sequence."""
    bar = "=" * 70
    print(bar)
    print(" REGRESSION ANALYSIS DEMONSTRATIONS")
    print(bar)

    np.random.seed(42)

    # Each demo reseeds internally, so order only affects console layout
    demos = (
        simple_ols_from_scratch,
        multiple_regression,
        polynomial_regression,
        residual_analysis,
        confidence_prediction_intervals,
    )
    for demo in demos:
        demo()

    print("\n" + bar)
    print(" All demonstrations completed successfully!")
    print(bar)
437
438
439if __name__ == "__main__":
440 main()