"""
09_multivariate.py

Demonstrates multivariate statistical analysis:
- PCA from scratch (eigendecomposition of the covariance matrix)
- PCA for dimensionality reduction
- Multivariate normal distribution (sampling, conditioning)
- Mahalanobis distance and outlier detection
- Canonical correlation
"""
11
12import numpy as np
13from scipy import stats
14
15try:
16 import matplotlib.pyplot as plt
17 HAS_PLT = True
18except ImportError:
19 HAS_PLT = False
20 print("matplotlib not available; skipping plots\n")
21
22
def print_section(title):
    """Display a section banner: a blank line, a 70-char rule, the title, a rule."""
    rule = "=" * 70
    print("\n" + rule)
    print(f" {title}")
    print(rule)
28
29
def pca_from_scratch():
    """Demonstrate PCA from scratch on synthetic 2-D data.

    Draws two independent Gaussians with different spreads, rotates them
    45 degrees to induce correlation, then performs PCA by
    eigendecomposing the sample covariance matrix: reports eigenvalues,
    eigenvectors, variance explained, the projected scores, and the
    (lossless) reconstruction error when all components are kept.
    """
    print_section("1. Principal Component Analysis (From Scratch)")

    # Generate correlated data
    np.random.seed(42)
    n = 200

    # True principal components
    mean = np.array([5, 3])
    # Data with specific correlation structure: sd 3 along one axis, sd 1 along the other
    x1 = np.random.normal(0, 3, n)
    x2 = np.random.normal(0, 1, n)

    # Rotate to create correlation
    angle = np.pi / 4  # 45 degrees
    rotation = np.array([
        [np.cos(angle), -np.sin(angle)],
        [np.sin(angle), np.cos(angle)]
    ])

    X_rotated = (np.column_stack([x1, x2]) @ rotation.T) + mean

    print(f"Generated data: n = {n}, p = 2")
    print(f"Mean: {np.mean(X_rotated, axis=0)}")

    # Center the data (PCA is defined on mean-centered data)
    X_centered = X_rotated - np.mean(X_rotated, axis=0)

    # Covariance matrix
    cov_matrix = np.cov(X_centered.T)

    print(f"\nCovariance matrix:")
    print(cov_matrix)

    # Eigen decomposition. Use eigh, not eig: the covariance matrix is
    # symmetric, and eigh guarantees real eigenvalues and orthonormal
    # eigenvectors (eig may return a complex dtype from round-off).
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort by eigenvalues (descending; eigh returns ascending order)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    print(f"\nEigenvalues: {eigenvalues}")
    print(f"Eigenvectors (principal components):")
    print(eigenvectors)

    # Variance explained: each eigenvalue's share of the total variance
    total_var = np.sum(eigenvalues)
    var_explained = eigenvalues / total_var

    print(f"\nVariance explained:")
    for i, var in enumerate(var_explained):
        print(f" PC{i+1}: {var:.4f} ({var*100:.2f}%)")

    # Project data onto principal components
    X_pca = X_centered @ eigenvectors

    print(f"\nPCA scores statistics:")
    # Score std along PC_k should match sqrt(eigenvalue_k)
    print(f" PC1: mean={np.mean(X_pca[:,0]):.4f}, std={np.std(X_pca[:,0], ddof=1):.4f}")
    print(f" PC2: mean={np.mean(X_pca[:,1]):.4f}, std={np.std(X_pca[:,1], ddof=1):.4f}")

    # Reconstruction: with all PCs kept this inverts the projection exactly
    X_reconstructed = X_pca @ eigenvectors.T + np.mean(X_rotated, axis=0)
    reconstruction_error = np.mean((X_rotated - X_reconstructed)**2)

    print(f"\nReconstruction error (using all PCs): {reconstruction_error:.6f}")

    if HAS_PLT:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Original data
        axes[0].scatter(X_rotated[:, 0], X_rotated[:, 1], alpha=0.6)
        axes[0].set_xlabel('X₁')
        axes[0].set_ylabel('X₂')
        axes[0].set_title('Original Data')
        axes[0].grid(True, alpha=0.3)
        axes[0].axis('equal')

        # Plot principal component directions (scaled to 2 std devs)
        origin = np.mean(X_rotated, axis=0)
        for i in range(2):
            direction = eigenvectors[:, i] * np.sqrt(eigenvalues[i]) * 2
            axes[0].arrow(origin[0], origin[1], direction[0], direction[1],
                          head_width=0.3, head_length=0.2, fc=f'C{i+1}', ec=f'C{i+1}',
                          linewidth=2, label=f'PC{i+1}')
        axes[0].legend()

        # PCA scores: the data in the decorrelated coordinate system
        axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6)
        axes[1].axhline(0, color='black', linewidth=0.5)
        axes[1].axvline(0, color='black', linewidth=0.5)
        axes[1].set_xlabel('PC1')
        axes[1].set_ylabel('PC2')
        axes[1].set_title('PCA Scores')
        axes[1].grid(True, alpha=0.3)
        axes[1].axis('equal')

        plt.tight_layout()
        plt.savefig('/tmp/pca_scratch.png', dpi=100)
        print("\n[Plot saved to /tmp/pca_scratch.png]")
        plt.close()
132
133
def pca_dimensionality_reduction():
    """Demonstrate PCA for dimensionality reduction.

    Embeds 3-D latent data in 10 dimensions via a random linear map plus
    small noise, then uses PCA to find a compact representation keeping
    95% of the variance, and measures the reconstruction error.
    """
    print_section("2. PCA for Dimensionality Reduction")

    # Generate high-dimensional data with low intrinsic dimensionality
    np.random.seed(123)
    n = 150
    k = 3  # True dimensionality

    # Generate in low dimension
    Z = np.random.randn(n, k)

    # Random projection to high dimension
    p = 10
    A = np.random.randn(k, p)
    X = Z @ A

    # Add small noise
    X += np.random.randn(n, p) * 0.5

    print(f"Data: n = {n}, p = {p}")
    print(f"True intrinsic dimension: k = {k}")

    # PCA. Use eigh, not eig: the covariance matrix is symmetric, and
    # eigh guarantees real eigenvalues and orthonormal eigenvectors
    # (eig may return a complex dtype from round-off).
    X_centered = X - np.mean(X, axis=0)
    cov_matrix = np.cov(X_centered.T)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort descending (eigh returns ascending order)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # Variance explained
    total_var = np.sum(eigenvalues)
    var_explained = eigenvalues / total_var
    cumulative_var = np.cumsum(var_explained)

    print(f"\nVariance explained by each PC:")
    for i in range(p):
        print(f" PC{i+1:2d}: {var_explained[i]:.4f} (cumulative: {cumulative_var[i]:.4f})")

    # Find number of PCs for 95% variance (argmax returns the first True)
    n_components_95 = np.argmax(cumulative_var >= 0.95) + 1

    print(f"\nComponents needed for 95% variance: {n_components_95}")
    print(f"Dimension reduction: {p} → {n_components_95}")

    # Project to reduced dimension
    X_reduced = X_centered @ eigenvectors[:, :n_components_95]

    print(f"\nReduced representation shape: {X_reduced.shape}")

    # Reconstruction from reduced representation (add the mean back)
    X_reconstructed = X_reduced @ eigenvectors[:, :n_components_95].T + np.mean(X, axis=0)
    reconstruction_error = np.mean((X - X_reconstructed)**2)

    print(f"Reconstruction error (using {n_components_95} PCs): {reconstruction_error:.4f}")

    if HAS_PLT:
        plt.figure(figsize=(10, 6))
        plt.bar(range(1, p + 1), var_explained, alpha=0.7, label='Individual')
        plt.plot(range(1, p + 1), cumulative_var, 'r-o', linewidth=2, label='Cumulative')
        plt.axhline(0.95, color='green', linestyle='--', label='95% threshold')
        plt.xlabel('Principal Component')
        plt.ylabel('Variance Explained')
        plt.title('PCA Scree Plot')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/pca_scree.png', dpi=100)
        print("\n[Plot saved to /tmp/pca_scree.png]")
        plt.close()
206
207
def multivariate_normal():
    """Illustrate the bivariate normal: parameters, sampling, conditioning.

    Prints the population mean, covariance and correlation, draws samples
    and compares empirical moments against the truth, derives the
    conditional X₁ | X₂ analytically, and (when matplotlib is available)
    plots the sample cloud with a 95% confidence ellipse.
    """
    print_section("3. Multivariate Normal Distribution")

    # Population parameters of the bivariate normal
    mu = np.array([2, 3])
    sigma = np.array([
        [2.0, 0.8],
        [0.8, 1.5]
    ])

    print(f"Multivariate normal distribution")
    print(f"Mean: {mu}")
    print(f"\nCovariance matrix:")
    print(sigma)

    # Correlation = covariance scaled by the outer product of std devs
    sd = np.sqrt(np.diag(sigma))
    print(f"\nCorrelation matrix:")
    print(sigma / np.outer(sd, sd))

    # Draw samples and compare empirical moments with the parameters
    np.random.seed(456)
    n = 500
    samples = np.random.multivariate_normal(mu, sigma, n)

    print(f"\nGenerated {n} samples")
    print(f"Sample mean: {np.mean(samples, axis=0)}")
    print(f"Sample covariance:")
    print(np.cov(samples.T))

    # Conditional distribution X₁ | X₂ = x₂ via the standard
    # bivariate-normal conditioning formulas
    x2_given = 4.0
    s11, s12, s22 = sigma[0, 0], sigma[0, 1], sigma[1, 1]
    mu_cond = mu[0] + (s12 / s22) * (x2_given - mu[1])
    var_cond = s11 - s12**2 / s22

    print(f"\nConditional distribution X₁ | X₂={x2_given}:")
    print(f" Mean: {mu_cond:.4f}")
    print(f" Variance: {var_cond:.4f}")

    if HAS_PLT:
        from matplotlib.patches import Ellipse

        plt.figure(figsize=(10, 8))

        # Scatter plot of the samples
        plt.scatter(samples[:, 0], samples[:, 1], alpha=0.5, s=20)

        # 95% confidence ellipse: axes along the covariance eigenvectors,
        # half-lengths sqrt(5.991 * eigenvalue) from the χ²(2) 0.95 quantile
        evals, evecs = np.linalg.eig(sigma)
        tilt = np.degrees(np.arctan2(evecs[1, 0], evecs[0, 0]))
        width, height = 2 * np.sqrt(5.991 * evals)
        plt.gca().add_patch(Ellipse(mu, width, height, angle=tilt,
                                    facecolor='none', edgecolor='red',
                                    linewidth=2,
                                    label='95% confidence ellipse'))

        plt.scatter([mu[0]], [mu[1]], color='red', s=100, marker='x',
                    zorder=5, label='Mean')

        plt.xlabel('X₁')
        plt.ylabel('X₂')
        plt.title('Bivariate Normal Distribution')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.axis('equal')
        plt.savefig('/tmp/multivariate_normal.png', dpi=100)
        print("\n[Plot saved to /tmp/multivariate_normal.png]")
        plt.close()
283
284
def mahalanobis_distance():
    """Demonstrate Mahalanobis distance.

    Samples correlated bivariate normal data, appends three hand-placed
    outliers, and compares Mahalanobis vs Euclidean distance for
    detecting them against a chi-square based threshold.
    """
    print_section("4. Mahalanobis Distance")

    # Generate data from a correlated bivariate normal
    np.random.seed(789)
    mean = np.array([0, 0])
    cov = np.array([
        [2.0, 1.2],
        [1.2, 1.5]
    ])

    n = 200
    X = np.random.multivariate_normal(mean, cov, n)

    print(f"Generated {n} points from bivariate normal")
    print(f"Mean: {mean}")

    # Add some outliers (points far from the bulk under the cov metric)
    outliers = np.array([
        [5, 5],
        [-6, 4],
        [4, -5]
    ])

    X_with_outliers = np.vstack([X, outliers])

    print(f"\nAdded {len(outliers)} outliers")

    # Calculate Mahalanobis distance; the inverse covariance defines the metric
    cov_inv = np.linalg.inv(cov)

    def mahalanobis(x, mean, cov_inv):
        """Calculate Mahalanobis distance sqrt((x-μ)ᵀ Σ⁻¹ (x-μ))."""
        diff = x - mean
        return np.sqrt(diff @ cov_inv @ diff)

    distances = np.array([mahalanobis(x, mean, cov_inv) for x in X_with_outliers])

    # Also calculate Euclidean distance for comparison
    euclidean_distances = np.sqrt(np.sum((X_with_outliers - mean)**2, axis=1))

    print(f"\nDistance statistics:")
    # Summaries use only the first n rows — the inliers
    print(f" Mahalanobis: mean={np.mean(distances[:n]):.2f}, std={np.std(distances[:n], ddof=1):.2f}")
    print(f" Euclidean: mean={np.mean(euclidean_distances[:n]):.2f}, std={np.std(euclidean_distances[:n], ddof=1):.2f}")

    # Outlier detection using Mahalanobis distance.
    # Chi-square threshold (95% for 2 dimensions): the squared Mahalanobis
    # distance of a d-dim normal is χ²(d), so take sqrt of the quantile.
    threshold = np.sqrt(stats.chi2.ppf(0.95, df=2))

    outlier_indices = np.where(distances > threshold)[0]

    print(f"\nOutlier detection (threshold={threshold:.2f}):")
    print(f" Detected outliers: {len(outlier_indices)}")
    print(f" True outliers start at index {n}")

    print(f"\nOutlier distances:")
    # Rows n.. are the injected outliers; show both metrics for each
    for idx in range(n, len(X_with_outliers)):
        print(f" Point {idx}: Mahalanobis={distances[idx]:.2f}, Euclidean={euclidean_distances[idx]:.2f}")

    if HAS_PLT:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: inliers colored by Mahalanobis distance, outliers as red crosses
        scatter = axes[0].scatter(X_with_outliers[:n, 0], X_with_outliers[:n, 1],
                                  c=distances[:n], cmap='viridis', alpha=0.6, s=30)
        axes[0].scatter(X_with_outliers[n:, 0], X_with_outliers[n:, 1],
                        c='red', marker='x', s=200, linewidths=3, label='Outliers')
        axes[0].scatter([mean[0]], [mean[1]], c='red', marker='*', s=200, zorder=5)

        # Confidence ellipse drawn at the detection threshold
        from matplotlib.patches import Ellipse
        eigenvalues, eigenvectors = np.linalg.eig(cov)
        angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
        width, height = 2 * threshold * np.sqrt(eigenvalues)
        ellipse = Ellipse(mean, width, height, angle=angle,
                          facecolor='none', edgecolor='red', linewidth=2, linestyle='--')
        axes[0].add_patch(ellipse)

        axes[0].set_xlabel('X₁')
        axes[0].set_ylabel('X₂')
        axes[0].set_title('Mahalanobis Distance')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        fig.colorbar(scatter, ax=axes[0], label='Mahalanobis Distance')

        # Right panel: histogram of inlier distances vs the theoretical density
        axes[1].hist(distances[:n], bins=30, alpha=0.7, label='Normal points', density=True)
        axes[1].axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold ({threshold:.2f})')

        # Mark each injected outlier's distance
        for i, d in enumerate(distances[n:]):
            axes[1].axvline(d, color='red', alpha=0.5, linewidth=2)

        # Overlay chi-square distribution: density of D where D² ~ χ²(2),
        # by change of variables f_D(d) = f_{χ²}(d²) · 2d
        x = np.linspace(0, distances.max(), 200)
        axes[1].plot(x, stats.chi2.pdf(x**2, df=2) * 2 * x, 'black', linestyle='--',
                     linewidth=2, label='χ²(2) density')

        axes[1].set_xlabel('Mahalanobis Distance')
        axes[1].set_ylabel('Density')
        axes[1].set_title('Distance Distribution')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/mahalanobis.png', dpi=100)
        print("\n[Plot saved to /tmp/mahalanobis.png]")
        plt.close()
393
394
def canonical_correlation():
    """Demonstrate canonical correlation analysis concept.

    Builds two sets of variables that share a single latent factor z,
    shows the raw pairwise cross-correlations, then estimates canonical
    correlations from the eigenvalues of Cxx⁻¹ Cxy Cyy⁻¹ Cyx.
    """
    print_section("5. Canonical Correlation Analysis")

    np.random.seed(999)
    n = 150

    # Shared latent factor driving both variable sets
    z = np.random.normal(0, 1, n)

    # Set X: two z-related variables plus one pure-noise variable.
    # Expressions evaluate left-to-right, preserving the RNG draw order.
    X = np.column_stack([
        z + np.random.normal(0, 0.5, n),
        0.8 * z + np.random.normal(0, 0.7, n),
        np.random.normal(0, 1, n),  # Independent
    ])

    # Set Y: same structure with different loadings
    Y = np.column_stack([
        0.9 * z + np.random.normal(0, 0.6, n),
        z + np.random.normal(0, 0.5, n),
        np.random.normal(0, 1, n),  # Independent
    ])

    print(f"Two sets of variables:")
    print(f" X: {X.shape[1]} variables")
    print(f" Y: {Y.shape[1]} variables")
    print(f" Both related through latent variable z")

    # Naive view first: the full cross-correlation table
    print(f"\nPairwise correlations (X vs Y):")
    print(f" Y1 Y2 Y3")

    for i in range(3):
        cells = [f"{np.corrcoef(X[:, i], Y[:, j])[0, 1]:7.3f}" for j in range(3)]
        print(f" X{i+1} " + " ".join(cells))

    # Simplified canonical correlation: find the linear combinations of
    # X and Y with maximal correlation via an eigenproblem.

    # Center both sets
    Xc = X - np.mean(X, axis=0)
    Yc = Y - np.mean(Y, axis=0)

    # Within- and between-set covariance blocks
    Cxx = np.cov(Xc.T)
    Cyy = np.cov(Yc.T)
    Cxy = np.cov(Xc.T, Yc.T)[:3, 3:]

    try:
        # Eigenvalues of Cxx⁻¹ Cxy Cyy⁻¹ Cyx are the squared canonical
        # correlations; take sqrt (abs guards tiny imaginary round-off)
        M = np.linalg.inv(Cxx) @ Cxy @ np.linalg.inv(Cyy) @ Cxy.T
        eigvals, _ = np.linalg.eig(M)

        rhos = np.sqrt(np.abs(eigvals))
        rhos = rhos[rhos.argsort()[::-1]]

        print(f"\nCanonical correlations:")
        for i, cc in enumerate(rhos):
            print(f" CC{i+1}: {cc:.4f}")

        print(f"\nFirst canonical correlation captures the latent relationship")

    except np.linalg.LinAlgError:
        print(f"\nSingular covariance matrix; CCA not computable with this approach")
468
469
def main():
    """Run all demonstrations in order."""
    banner = "=" * 70
    print(banner)
    print(" MULTIVARIATE ANALYSIS DEMONSTRATIONS")
    print(banner)

    np.random.seed(42)

    # Each demo seeds its own RNG stream, so order is independent
    for demo in (
        pca_from_scratch,
        pca_dimensionality_reduction,
        multivariate_normal,
        mahalanobis_distance,
        canonical_correlation,
    ):
        demo()

    print("\n" + banner)
    print(" All demonstrations completed successfully!")
    print(banner)
487
488
# Script entry point: run all demos when executed directly (not on import)
if __name__ == "__main__":
    main()