"""
09_multivariate.py

Demonstrates multivariate statistical analysis:
- PCA from scratch (eigendecomposition of the covariance matrix)
- PCA for dimensionality reduction
- Multivariate normal distribution and Gaussian conditioning
- Mahalanobis distance and outlier detection
- Canonical correlation
"""
 11
 12import numpy as np
 13from scipy import stats
 14
 15try:
 16    import matplotlib.pyplot as plt
 17    HAS_PLT = True
 18except ImportError:
 19    HAS_PLT = False
 20    print("matplotlib not available; skipping plots\n")
 21
 22
 23def print_section(title):
 24    """Print formatted section header."""
 25    print("\n" + "=" * 70)
 26    print(f"  {title}")
 27    print("=" * 70)
 28
 29
 30def pca_from_scratch():
 31    """Demonstrate PCA from scratch."""
 32    print_section("1. Principal Component Analysis (From Scratch)")
 33
 34    # Generate correlated data
 35    np.random.seed(42)
 36    n = 200
 37
 38    # True principal components
 39    mean = np.array([5, 3])
 40    # Data with specific correlation structure
 41    x1 = np.random.normal(0, 3, n)
 42    x2 = np.random.normal(0, 1, n)
 43
 44    # Rotate to create correlation
 45    angle = np.pi / 4  # 45 degrees
 46    rotation = np.array([
 47        [np.cos(angle), -np.sin(angle)],
 48        [np.sin(angle), np.cos(angle)]
 49    ])
 50
 51    X_rotated = (np.column_stack([x1, x2]) @ rotation.T) + mean
 52
 53    print(f"Generated data: n = {n}, p = 2")
 54    print(f"Mean: {np.mean(X_rotated, axis=0)}")
 55
 56    # Center the data
 57    X_centered = X_rotated - np.mean(X_rotated, axis=0)
 58
 59    # Covariance matrix
 60    cov_matrix = np.cov(X_centered.T)
 61
 62    print(f"\nCovariance matrix:")
 63    print(cov_matrix)
 64
 65    # Eigen decomposition
 66    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
 67
 68    # Sort by eigenvalues (descending)
 69    idx = eigenvalues.argsort()[::-1]
 70    eigenvalues = eigenvalues[idx]
 71    eigenvectors = eigenvectors[:, idx]
 72
 73    print(f"\nEigenvalues: {eigenvalues}")
 74    print(f"Eigenvectors (principal components):")
 75    print(eigenvectors)
 76
 77    # Variance explained
 78    total_var = np.sum(eigenvalues)
 79    var_explained = eigenvalues / total_var
 80
 81    print(f"\nVariance explained:")
 82    for i, var in enumerate(var_explained):
 83        print(f"  PC{i+1}: {var:.4f} ({var*100:.2f}%)")
 84
 85    # Project data onto principal components
 86    X_pca = X_centered @ eigenvectors
 87
 88    print(f"\nPCA scores statistics:")
 89    print(f"  PC1: mean={np.mean(X_pca[:,0]):.4f}, std={np.std(X_pca[:,0], ddof=1):.4f}")
 90    print(f"  PC2: mean={np.mean(X_pca[:,1]):.4f}, std={np.std(X_pca[:,1], ddof=1):.4f}")
 91
 92    # Reconstruction
 93    X_reconstructed = X_pca @ eigenvectors.T + np.mean(X_rotated, axis=0)
 94    reconstruction_error = np.mean((X_rotated - X_reconstructed)**2)
 95
 96    print(f"\nReconstruction error (using all PCs): {reconstruction_error:.6f}")
 97
 98    if HAS_PLT:
 99        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
100
101        # Original data
102        axes[0].scatter(X_rotated[:, 0], X_rotated[:, 1], alpha=0.6)
103        axes[0].set_xlabel('X₁')
104        axes[0].set_ylabel('X₂')
105        axes[0].set_title('Original Data')
106        axes[0].grid(True, alpha=0.3)
107        axes[0].axis('equal')
108
109        # Plot principal component directions
110        origin = np.mean(X_rotated, axis=0)
111        for i in range(2):
112            direction = eigenvectors[:, i] * np.sqrt(eigenvalues[i]) * 2
113            axes[0].arrow(origin[0], origin[1], direction[0], direction[1],
114                         head_width=0.3, head_length=0.2, fc=f'C{i+1}', ec=f'C{i+1}',
115                         linewidth=2, label=f'PC{i+1}')
116        axes[0].legend()
117
118        # PCA scores
119        axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6)
120        axes[1].axhline(0, color='black', linewidth=0.5)
121        axes[1].axvline(0, color='black', linewidth=0.5)
122        axes[1].set_xlabel('PC1')
123        axes[1].set_ylabel('PC2')
124        axes[1].set_title('PCA Scores')
125        axes[1].grid(True, alpha=0.3)
126        axes[1].axis('equal')
127
128        plt.tight_layout()
129        plt.savefig('/tmp/pca_scratch.png', dpi=100)
130        print("\n[Plot saved to /tmp/pca_scratch.png]")
131        plt.close()
132
133
def pca_dimensionality_reduction():
    """Demonstrate PCA for dimensionality reduction.

    Embeds 3-dimensional latent data in 10 dimensions via a random linear
    map plus small noise, then shows that a handful of principal components
    captures 95% of the variance and supports low-error reconstruction.
    Saves a scree plot to /tmp when matplotlib is available.
    """
    print_section("2. PCA for Dimensionality Reduction")

    # Generate high-dimensional data with low intrinsic dimensionality
    np.random.seed(123)
    n = 150
    k = 3  # True dimensionality

    # Generate in low dimension
    Z = np.random.randn(n, k)

    # Random projection to high dimension
    p = 10
    A = np.random.randn(k, p)
    X = Z @ A

    # Small isotropic noise keeps the trailing eigenvalues small but nonzero.
    X += np.random.randn(n, p) * 0.5

    print(f"Data: n = {n}, p = {p}")
    print(f"True intrinsic dimension: k = {k}")

    # PCA: eigendecomposition of the sample covariance of the centered data.
    X_centered = X - np.mean(X, axis=0)
    cov_matrix = np.cov(X_centered.T)
    # eigh, not eig: the covariance matrix is symmetric, so eigh is the
    # stable choice and never returns spurious imaginary parts.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort descending (eigh returns ascending order)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # Variance explained, individually and cumulatively
    total_var = np.sum(eigenvalues)
    var_explained = eigenvalues / total_var
    cumulative_var = np.cumsum(var_explained)

    print(f"\nVariance explained by each PC:")
    for i in range(p):
        print(f"  PC{i+1:2d}: {var_explained[i]:.4f} (cumulative: {cumulative_var[i]:.4f})")

    # Smallest number of PCs whose cumulative variance reaches 95%
    n_components_95 = np.argmax(cumulative_var >= 0.95) + 1

    print(f"\nComponents needed for 95% variance: {n_components_95}")
    print(f"Dimension reduction: {p} → {n_components_95}")

    # Project to reduced dimension
    X_reduced = X_centered @ eigenvectors[:, :n_components_95]

    print(f"\nReduced representation shape: {X_reduced.shape}")

    # Reconstruction: back-project the scores and re-add the mean.
    X_reconstructed = X_reduced @ eigenvectors[:, :n_components_95].T + np.mean(X, axis=0)
    reconstruction_error = np.mean((X - X_reconstructed)**2)

    print(f"Reconstruction error (using {n_components_95} PCs): {reconstruction_error:.4f}")

    if HAS_PLT:
        plt.figure(figsize=(10, 6))
        plt.bar(range(1, p + 1), var_explained, alpha=0.7, label='Individual')
        plt.plot(range(1, p + 1), cumulative_var, 'r-o', linewidth=2, label='Cumulative')
        plt.axhline(0.95, color='green', linestyle='--', label='95% threshold')
        plt.xlabel('Principal Component')
        plt.ylabel('Variance Explained')
        plt.title('PCA Scree Plot')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('/tmp/pca_scree.png', dpi=100)
        print("\n[Plot saved to /tmp/pca_scree.png]")
        plt.close()
206
207
def multivariate_normal():
    """Demonstrate the multivariate (bivariate) normal distribution.

    Defines a 2-D Gaussian, derives its correlation matrix, compares sample
    moments against the population parameters, and applies the standard
    Gaussian conditioning formulas for X1 | X2.  When matplotlib is present,
    the samples are plotted together with a 95% confidence ellipse.
    """
    print_section("3. Multivariate Normal Distribution")

    # Population parameters of the bivariate normal.
    mean = np.array([2, 3])
    cov = np.array([
        [2.0, 0.8],
        [0.8, 1.5]
    ])

    print(f"Multivariate normal distribution")
    print(f"Mean: {mean}")
    print(f"\nCovariance matrix:")
    print(cov)

    # Correlation = covariance scaled by the outer product of std devs.
    std = np.sqrt(np.diag(cov))
    corr = cov / np.outer(std, std)

    print(f"\nCorrelation matrix:")
    print(corr)

    # Draw samples and compare sample moments to the population values.
    np.random.seed(456)
    n = 500
    draws = np.random.multivariate_normal(mean, cov, n)

    print(f"\nGenerated {n} samples")
    print(f"Sample mean: {np.mean(draws, axis=0)}")
    print(f"Sample covariance:")
    print(np.cov(draws.T))

    # Conditional distribution X₁ | X₂ = x₂ (Gaussian conditioning formulas).
    x2_given = 4.0

    mu1, mu2 = mean
    s11 = cov[0, 0]
    s12 = cov[0, 1]
    s22 = cov[1, 1]

    cond_mean = mu1 + (s12 / s22) * (x2_given - mu2)
    cond_var = s11 - s12**2 / s22

    print(f"\nConditional distribution X₁ | X₂={x2_given}:")
    print(f"  Mean: {cond_mean:.4f}")
    print(f"  Variance: {cond_var:.4f}")

    if HAS_PLT:
        plt.figure(figsize=(10, 8))

        plt.scatter(draws[:, 0], draws[:, 1], alpha=0.5, s=20)

        # 95% confidence ellipse from the covariance eigenstructure.
        from matplotlib.patches import Ellipse
        eigenvalues, eigenvectors = np.linalg.eig(cov)
        angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
        width, height = 2 * np.sqrt(5.991 * eigenvalues)  # 95% confidence
        ellipse = Ellipse(mean, width, height, angle=angle,
                          facecolor='none', edgecolor='red', linewidth=2,
                          label='95% confidence ellipse')
        plt.gca().add_patch(ellipse)

        plt.scatter([mean[0]], [mean[1]], color='red', s=100, marker='x',
                    zorder=5, label='Mean')

        plt.xlabel('X₁')
        plt.ylabel('X₂')
        plt.title('Bivariate Normal Distribution')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.axis('equal')
        plt.savefig('/tmp/multivariate_normal.png', dpi=100)
        print("\n[Plot saved to /tmp/multivariate_normal.png]")
        plt.close()
283
284
def mahalanobis_distance():
    """Demonstrate Mahalanobis distance for covariance-aware outlier detection.

    Samples points from a correlated bivariate normal, appends three known
    outliers, and flags points whose Mahalanobis distance exceeds the
    chi-square 95% threshold (squared Mahalanobis distance of a d-dim
    Gaussian is χ²(d)).  Euclidean distances are reported for contrast.
    Saves a two-panel plot to /tmp when matplotlib is available.
    """
    print_section("4. Mahalanobis Distance")

    # Generate data
    np.random.seed(789)
    mean = np.array([0, 0])
    cov = np.array([
        [2.0, 1.2],
        [1.2, 1.5]
    ])

    n = 200
    X = np.random.multivariate_normal(mean, cov, n)

    print(f"Generated {n} points from bivariate normal")
    print(f"Mean: {mean}")

    # Add some outliers, placed well outside the bulk of the distribution.
    outliers = np.array([
        [5, 5],
        [-6, 4],
        [4, -5]
    ])

    X_with_outliers = np.vstack([X, outliers])

    print(f"\nAdded {len(outliers)} outliers")

    # Mahalanobis distance: sqrt((x - mu)^T S^-1 (x - mu)).  Computed for
    # all rows at once via einsum instead of a per-point Python loop — same
    # values, vectorized.
    cov_inv = np.linalg.inv(cov)
    diff = X_with_outliers - mean
    distances = np.sqrt(np.einsum('ij,jk,ik->i', diff, cov_inv, diff))

    # Also calculate Euclidean distance for comparison
    euclidean_distances = np.sqrt(np.sum((X_with_outliers - mean)**2, axis=1))

    print(f"\nDistance statistics:")
    print(f"  Mahalanobis: mean={np.mean(distances[:n]):.2f}, std={np.std(distances[:n], ddof=1):.2f}")
    print(f"  Euclidean: mean={np.mean(euclidean_distances[:n]):.2f}, std={np.std(euclidean_distances[:n], ddof=1):.2f}")

    # Outlier threshold: sqrt of the chi-square 95% quantile with df=2.
    threshold = np.sqrt(stats.chi2.ppf(0.95, df=2))

    outlier_indices = np.where(distances > threshold)[0]

    print(f"\nOutlier detection (threshold={threshold:.2f}):")
    print(f"  Detected outliers: {len(outlier_indices)}")
    print(f"  True outliers start at index {n}")

    print(f"\nOutlier distances:")
    for idx in range(n, len(X_with_outliers)):
        print(f"  Point {idx}: Mahalanobis={distances[idx]:.2f}, Euclidean={euclidean_distances[idx]:.2f}")

    if HAS_PLT:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: points colored by Mahalanobis distance.
        scatter = axes[0].scatter(X_with_outliers[:n, 0], X_with_outliers[:n, 1],
                                 c=distances[:n], cmap='viridis', alpha=0.6, s=30)
        axes[0].scatter(X_with_outliers[n:, 0], X_with_outliers[n:, 1],
                       c='red', marker='x', s=200, linewidths=3, label='Outliers')
        axes[0].scatter([mean[0]], [mean[1]], c='red', marker='*', s=200, zorder=5)

        # Decision-boundary ellipse at the chi-square threshold.
        # eigh: cov is symmetric, so eigh gives real, consistently paired
        # eigenvalues/eigenvectors for the axis lengths and rotation angle.
        from matplotlib.patches import Ellipse
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
        width, height = 2 * threshold * np.sqrt(eigenvalues)
        ellipse = Ellipse(mean, width, height, angle=angle,
                         facecolor='none', edgecolor='red', linewidth=2, linestyle='--')
        axes[0].add_patch(ellipse)

        axes[0].set_xlabel('X₁')
        axes[0].set_ylabel('X₂')
        axes[0].set_title('Mahalanobis Distance')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        fig.colorbar(scatter, ax=axes[0], label='Mahalanobis Distance')

        # Right panel: distance histogram with threshold and outlier markers.
        axes[1].hist(distances[:n], bins=30, alpha=0.7, label='Normal points', density=True)
        axes[1].axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold ({threshold:.2f})')

        for i, d in enumerate(distances[n:]):
            axes[1].axvline(d, color='red', alpha=0.5, linewidth=2)

        # Overlay the density of the distance itself: change of variables
        # from χ²(2) on d² gives pdf(d) = chi2.pdf(d², 2) * 2d.
        x = np.linspace(0, distances.max(), 200)
        axes[1].plot(x, stats.chi2.pdf(x**2, df=2) * 2 * x, 'black', linestyle='--',
                    linewidth=2, label='χ²(2) density')

        axes[1].set_xlabel('Mahalanobis Distance')
        axes[1].set_ylabel('Density')
        axes[1].set_title('Distance Distribution')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('/tmp/mahalanobis.png', dpi=100)
        print("\n[Plot saved to /tmp/mahalanobis.png]")
        plt.close()
394
def canonical_correlation():
    """Demonstrate canonical correlation analysis concept.

    Two variable sets X and Y share a common latent factor z.  CCA finds
    linear combinations of each set with maximal mutual correlation; here
    the first canonical correlation should be large (the latent link) and
    the remaining ones near zero.
    """
    print_section("5. Canonical Correlation Analysis")

    # Generate two sets of variables with correlation
    np.random.seed(999)
    n = 150

    # Latent variable shared by both sets
    z = np.random.normal(0, 1, n)

    # First set of variables (related to z)
    X1 = z + np.random.normal(0, 0.5, n)
    X2 = 0.8 * z + np.random.normal(0, 0.7, n)
    X3 = np.random.normal(0, 1, n)  # Independent

    X = np.column_stack([X1, X2, X3])

    # Second set of variables (also related to z)
    Y1 = 0.9 * z + np.random.normal(0, 0.6, n)
    Y2 = z + np.random.normal(0, 0.5, n)
    Y3 = np.random.normal(0, 1, n)  # Independent

    Y = np.column_stack([Y1, Y2, Y3])

    # Take dimensions from the data (rather than hard-coding 3) so the
    # computation below stays correct if variables are added to either set.
    p = X.shape[1]
    q = Y.shape[1]

    print(f"Two sets of variables:")
    print(f"  X: {X.shape[1]} variables")
    print(f"  Y: {Y.shape[1]} variables")
    print(f"  Both related through latent variable z")

    # Simple approach: correlations between all pairs
    print(f"\nPairwise correlations (X vs Y):")
    print(f"       Y1      Y2      Y3")

    for i in range(p):
        row = []
        for j in range(q):
            corr = np.corrcoef(X[:, i], Y[:, j])[0, 1]
            row.append(f"{corr:7.3f}")
        print(f"  X{i+1} " + " ".join(row))

    # Center the data
    X_centered = X - np.mean(X, axis=0)
    Y_centered = Y - np.mean(Y, axis=0)

    # Covariance blocks: np.cov on the stacked data yields
    # [[Cxx, Cxy], [Cyx, Cyy]]; slice the cross block out by p.
    Cxx = np.cov(X_centered.T)
    Cyy = np.cov(Y_centered.T)
    Cxy = np.cov(X_centered.T, Y_centered.T)[:p, p:]

    # Canonical correlations are the square roots of the eigenvalues of
    # Cxx^(-1) Cxy Cyy^(-1) Cyx.
    try:
        M = np.linalg.inv(Cxx) @ Cxy @ np.linalg.inv(Cyy) @ Cxy.T
        eigenvalues, eigenvectors = np.linalg.eig(M)

        # abs() guards against tiny negative / complex values from round-off.
        canonical_corrs = np.sqrt(np.abs(eigenvalues))

        idx = canonical_corrs.argsort()[::-1]
        canonical_corrs = canonical_corrs[idx]

        print(f"\nCanonical correlations:")
        for i, cc in enumerate(canonical_corrs):
            print(f"  CC{i+1}: {cc:.4f}")

        print(f"\nFirst canonical correlation captures the latent relationship")

    except np.linalg.LinAlgError:
        print(f"\nSingular covariance matrix; CCA not computable with this approach")
468
469
def main():
    """Run every demonstration in sequence."""
    bar = "=" * 70
    print(bar)
    print("  MULTIVARIATE ANALYSIS DEMONSTRATIONS")
    print(bar)

    np.random.seed(42)

    demos = (
        pca_from_scratch,
        pca_dimensionality_reduction,
        multivariate_normal,
        mahalanobis_distance,
        canonical_correlation,
    )
    for demo in demos:
        demo()

    print("\n" + bar)
    print("  All demonstrations completed successfully!")
    print(bar)
487
488
if __name__ == "__main__":  # script entry point
    main()