# 02_svd_pca.py
"""
Singular Value Decomposition (SVD) and Principal Component Analysis (PCA)

Demonstrates:
- SVD decomposition with NumPy
- PCA from scratch (centering → covariance → eigendecomposition)
- Comparison with sklearn PCA
- Explained variance visualization
- ML application: dimensionality reduction

Dependencies: numpy, matplotlib, sklearn
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA as SklearnPCA
from sklearn.datasets import load_iris
 19
 20def demonstrate_svd():
 21    """Demonstrate Singular Value Decomposition"""
 22    print("=" * 60)
 23    print("SINGULAR VALUE DECOMPOSITION (SVD)")
 24    print("=" * 60)
 25
 26    # Create a matrix
 27    A = np.array([[1, 2, 3],
 28                  [4, 5, 6],
 29                  [7, 8, 9],
 30                  [10, 11, 12]], dtype=float)
 31
 32    print(f"\nOriginal matrix A (4x3):\n{A}")
 33    print(f"Shape: {A.shape}")
 34    print(f"Rank: {np.linalg.matrix_rank(A)}")
 35
 36    # Perform SVD: A = U @ S @ V^T
 37    U, s, Vt = np.linalg.svd(A, full_matrices=True)
 38
 39    print(f"\n--- SVD Components ---")
 40    print(f"U shape: {U.shape} (left singular vectors)")
 41    print(f"s shape: {s.shape} (singular values)")
 42    print(f"V^T shape: {Vt.shape} (right singular vectors)")
 43
 44    print(f"\nSingular values: {s}")
 45
 46    # Reconstruct S matrix (diagonal)
 47    S = np.zeros((U.shape[1], Vt.shape[0]))
 48    S[:len(s), :len(s)] = np.diag(s)
 49
 50    print(f"\nS matrix (diagonal):\n{S}")
 51
 52    # Reconstruct A
 53    A_reconstructed = U @ S @ Vt
 54    print(f"\nReconstructed A:\n{A_reconstructed}")
 55    print(f"Reconstruction error: {np.linalg.norm(A - A_reconstructed):.10f}")
 56
 57    # Low-rank approximation
 58    print("\n--- Low-Rank Approximation ---")
 59    k = 2  # Keep only top 2 singular values
 60
 61    S_k = S.copy()
 62    S_k[:, k:] = 0  # Zero out smaller singular values
 63
 64    A_approx = U @ S_k @ Vt
 65    print(f"\nRank-{k} approximation:\n{A_approx}")
 66
 67    error = np.linalg.norm(A - A_approx)
 68    print(f"Approximation error (Frobenius norm): {error:.4f}")
 69
 70    # Energy preserved
 71    energy_original = np.sum(s**2)
 72    energy_kept = np.sum(s[:k]**2)
 73    print(f"\nEnergy preserved: {energy_kept/energy_original*100:.2f}%")
 74
 75
 76def pca_from_scratch(X, n_components=2):
 77    """
 78    Implement PCA from scratch
 79
 80    Steps:
 81    1. Center the data (subtract mean)
 82    2. Compute covariance matrix
 83    3. Compute eigenvalues and eigenvectors
 84    4. Sort by eigenvalues (descending)
 85    5. Project data onto top k eigenvectors
 86    """
 87    print("\n--- PCA From Scratch ---")
 88
 89    # Step 1: Center the data
 90    mean = np.mean(X, axis=0)
 91    X_centered = X - mean
 92    print(f"Data shape: {X.shape}")
 93    print(f"Mean: {mean}")
 94
 95    # Step 2: Compute covariance matrix
 96    n_samples = X.shape[0]
 97    cov_matrix = (X_centered.T @ X_centered) / (n_samples - 1)
 98    print(f"\nCovariance matrix shape: {cov_matrix.shape}")
 99    print(f"Covariance matrix:\n{cov_matrix}")
100
101    # Step 3: Compute eigenvalues and eigenvectors
102    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
103
104    # Step 4: Sort by eigenvalues (descending)
105    idx = eigenvalues.argsort()[::-1]
106    eigenvalues = eigenvalues[idx]
107    eigenvectors = eigenvectors[:, idx]
108
109    print(f"\nEigenvalues: {eigenvalues}")
110    print(f"\nTop {n_components} eigenvectors (principal components):")
111    print(eigenvectors[:, :n_components])
112
113    # Step 5: Project data
114    X_pca = X_centered @ eigenvectors[:, :n_components]
115
116    # Explained variance
117    explained_variance = eigenvalues / np.sum(eigenvalues)
118    explained_variance_cumsum = np.cumsum(explained_variance)
119
120    print(f"\nExplained variance ratio: {explained_variance[:n_components]}")
121    print(f"Cumulative explained variance: {explained_variance_cumsum[:n_components]}")
122
123    return X_pca, eigenvectors[:, :n_components], explained_variance, mean
124
125
def pca_with_sklearn(X, n_components=2):
    """Run sklearn's PCA on X, print the same diagnostics as the scratch version."""
    print("\n--- PCA with sklearn ---")

    model = SklearnPCA(n_components=n_components)
    transformed = model.fit_transform(X)
    cumulative = np.cumsum(model.explained_variance_ratio_)

    print(f"Transformed shape: {transformed.shape}")
    print(f"Explained variance ratio: {model.explained_variance_ratio_}")
    print(f"Cumulative variance: {cumulative}")
    print(f"\nPrincipal components (shape {model.components_.shape}):")
    # Transposed so rows are features and columns are components,
    # matching the layout printed by pca_from_scratch.
    print(model.components_.T)

    return transformed, model
140
141
def demonstrate_pca_on_iris():
    """Compare the from-scratch PCA with sklearn's PCA on the Iris dataset."""

    def _banner(title):
        # Section separator used throughout this demo's console output.
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    _banner("PCA ON IRIS DATASET")

    # 150 samples x 4 features, three classes.
    iris = load_iris()
    X, y = iris.data, iris.target
    feature_names = iris.feature_names

    print(f"\nIris dataset:")
    print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")
    print(f"Feature names: {feature_names}")
    print(f"First 5 samples:\n{X[:5]}")

    # From-scratch implementation first...
    _banner("CUSTOM PCA")
    X_pca_custom, components_custom, explained_var, mean = pca_from_scratch(X, n_components=2)

    # ...then sklearn on the same data.
    _banner("SKLEARN PCA")
    X_pca_sklearn, pca_model = pca_with_sklearn(X, n_components=2)

    # The two projections should agree except for per-axis sign flips,
    # so compare absolute values.
    print("\n--- Verification ---")
    print(f"Custom PCA result (first 5):\n{X_pca_custom[:5]}")
    print(f"Sklearn PCA result (first 5):\n{X_pca_sklearn[:5]}")

    diff = np.abs(X_pca_custom) - np.abs(X_pca_sklearn)
    print(f"\nMax absolute difference: {np.max(np.abs(diff)):.10f}")

    return X_pca_sklearn, y, pca_model, X, feature_names
181
182
def visualize_pca_results(X_pca, y, pca_model, X_original, feature_names,
                          save_path='pca_visualization.png'):
    """
    Visualize PCA results as a 3-panel figure saved to disk.

    Panels: (1) 2-D projection colored by class, (2) per-component and
    cumulative explained variance, (3) feature loadings of PC1/PC2.

    Parameters
    ----------
    X_pca : ndarray of shape (n_samples, >=2)
        Data projected onto the principal components.
    y : array-like of shape (n_samples,)
        Class labels used to color the scatter plot.
    pca_model : fitted PCA model
        Must expose `explained_variance_ratio_` and `components_`.
    X_original : ndarray
        Original (unprojected) data; kept for interface compatibility,
        not used by the current plots.
    feature_names : sequence of str
        One name per original feature, used to label the loadings axis.
    save_path : str, default 'pca_visualization.png'
        Where to write the figure. Replaces the previous hard-coded
        absolute path, which only existed on one machine and disagreed
        with the printed confirmation message.
    """
    print("\n" + "=" * 60)
    print("VISUALIZING PCA RESULTS")
    print("=" * 60)

    fig = plt.figure(figsize=(15, 5))

    # Plot 1: PCA projection (2D), colored by class label
    ax1 = fig.add_subplot(131)
    scatter = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis',
                         edgecolors='k', s=50, alpha=0.7)
    ax1.set_xlabel('PC1')
    ax1.set_ylabel('PC2')
    ax1.set_title('PCA Projection (2D)')
    ax1.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax1, label='Class')

    # Plot 2: Explained variance (bars = individual, line = cumulative)
    ax2 = fig.add_subplot(132)
    explained_var = pca_model.explained_variance_ratio_
    cumsum_var = np.cumsum(explained_var)

    x_pos = np.arange(len(explained_var))
    ax2.bar(x_pos, explained_var, alpha=0.7, label='Individual')
    ax2.plot(x_pos, cumsum_var, 'ro-', label='Cumulative')
    ax2.set_xlabel('Principal Component')
    ax2.set_ylabel('Explained Variance Ratio')
    ax2.set_title('Explained Variance by Component')
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels([f'PC{i+1}' for i in x_pos])
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Component loadings — how much each original feature
    # contributes to PC1 and PC2
    ax3 = fig.add_subplot(133)
    components = pca_model.components_.T  # Transpose to get (features, components)

    x_pos = np.arange(len(feature_names))
    width = 0.35

    bars1 = ax3.bar(x_pos - width/2, components[:, 0], width,
                    label='PC1', alpha=0.7)
    bars2 = ax3.bar(x_pos + width/2, components[:, 1], width,
                    label='PC2', alpha=0.7)

    ax3.set_xlabel('Features')
    ax3.set_ylabel('Loading')
    ax3.set_title('Principal Component Loadings')
    ax3.set_xticks(x_pos)
    # First word of each feature name keeps the tick labels short
    ax3.set_xticklabels([name.split()[0] for name in feature_names], rotation=45, ha='right')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')
    ax3.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    print(f"Visualization saved to {save_path}")
    plt.close()
242
243
def svd_vs_pca():
    """
    Show the relationship between SVD and PCA.

    Computes principal components of the same random data two ways —
    eigendecomposition of the covariance matrix vs. SVD of the centered
    data — and verifies the eigenvalues and eigenvectors match.
    """
    print("\n" + "=" * 60)
    print("RELATIONSHIP BETWEEN SVD AND PCA")
    print("=" * 60)

    # Generate reproducible random data: 100 samples, 5 features
    np.random.seed(42)
    X = np.random.randn(100, 5)

    # Center the data (PCA operates on centered data)
    X_centered = X - np.mean(X, axis=0)

    print(f"Data shape: {X.shape}")

    # Method 1: PCA via eigendecomposition of the covariance matrix.
    # eigh, not eig: the covariance matrix is symmetric, so eigh guarantees
    # real (not complex-dtype) eigenvalues and eigenvectors.
    cov = (X_centered.T @ X_centered) / (X.shape[0] - 1)
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # eigh returns ascending order; sort descending to match SVD convention
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    print("\n--- PCA via Eigendecomposition ---")
    print(f"Eigenvalues: {eigenvalues}")

    # Method 2: PCA via SVD of the centered data (singular values already descending)
    U, s, Vt = np.linalg.svd(X_centered, full_matrices=False)

    # Singular values squared relate to eigenvalues: eigenvalue = s^2 / (n-1)
    eigenvalues_from_svd = (s**2) / (X.shape[0] - 1)

    print("\n--- PCA via SVD ---")
    print(f"Eigenvalues from SVD: {eigenvalues_from_svd}")

    # V^T from SVD gives principal components; compare up to sign flips
    print("\n--- Comparison ---")
    print(f"Eigenvalues match: {np.allclose(eigenvalues, eigenvalues_from_svd)}")
    print(f"Eigenvectors match (up to sign): {np.allclose(np.abs(eigenvectors), np.abs(Vt.T))}")

    print("\nKey insight:")
    print("- SVD of centered data X = U @ S @ V^T")
    print("- V^T contains principal components (eigenvectors of X^T X)")
    print("- Singular values s relate to eigenvalues: eigenvalue = s^2 / (n-1)")
287
288
def main():
    """Run every demonstration in sequence and save the PCA figure."""
    demonstrate_svd()

    print("\n")
    X_pca, y, pca_model, X_original, feature_names = demonstrate_pca_on_iris()

    visualize_pca_results(X_pca, y, pca_model, X_original, feature_names)

    svd_vs_pca()

    print("\n" + "=" * 60)
    print("All demonstrations completed!")
    print("=" * 60)


if __name__ == "__main__":
    main()