1"""
2Singular Value Decomposition (SVD) and Principal Component Analysis (PCA)
3
4Demonstrates:
5- SVD decomposition with NumPy
6- PCA from scratch (centering → covariance → eigendecomposition)
7- Comparison with sklearn PCA
8- Explained variance visualization
9- ML application: dimensionality reduction
10
11Dependencies: numpy, matplotlib, sklearn
12"""
13
14import numpy as np
15import matplotlib.pyplot as plt
16from sklearn.decomposition import PCA as SklearnPCA
17from sklearn.datasets import load_iris
18
19
def demonstrate_svd():
    """Walk through a full SVD of a small matrix: decomposition,
    exact reconstruction, and a rank-k approximation with its
    preserved-energy ratio."""
    print("=" * 60)
    print("SINGULAR VALUE DECOMPOSITION (SVD)")
    print("=" * 60)

    matrix = np.array(
        [[1, 2, 3],
         [4, 5, 6],
         [7, 8, 9],
         [10, 11, 12]],
        dtype=float,
    )

    print(f"\nOriginal matrix A (4x3):\n{matrix}")
    print(f"Shape: {matrix.shape}")
    print(f"Rank: {np.linalg.matrix_rank(matrix)}")

    # Full SVD: matrix = left @ sigma @ right_t
    left, sing_vals, right_t = np.linalg.svd(matrix, full_matrices=True)

    print(f"\n--- SVD Components ---")
    print(f"U shape: {left.shape} (left singular vectors)")
    print(f"s shape: {sing_vals.shape} (singular values)")
    print(f"V^T shape: {right_t.shape} (right singular vectors)")

    print(f"\nSingular values: {sing_vals}")

    # Embed the singular values in a rectangular "diagonal" matrix so that
    # U @ S @ V^T is dimensionally valid under full_matrices=True.
    n_vals = len(sing_vals)
    sigma = np.zeros((left.shape[1], right_t.shape[0]))
    sigma[:n_vals, :n_vals] = np.diag(sing_vals)

    print(f"\nS matrix (diagonal):\n{sigma}")

    # Multiplying the factors back together should recover A exactly
    # (up to floating-point error).
    rebuilt = left @ sigma @ right_t
    print(f"\nReconstructed A:\n{rebuilt}")
    print(f"Reconstruction error: {np.linalg.norm(matrix - rebuilt):.10f}")

    print("\n--- Low-Rank Approximation ---")
    k = 2  # number of leading singular values to keep

    truncated = sigma.copy()
    truncated[:, k:] = 0  # discard the tail of the spectrum

    approx = left @ truncated @ right_t
    print(f"\nRank-{k} approximation:\n{approx}")

    frob_err = np.linalg.norm(matrix - approx)
    print(f"Approximation error (Frobenius norm): {frob_err:.4f}")

    # Fraction of squared-singular-value "energy" captured by the top k.
    total_energy = np.sum(sing_vals**2)
    kept_energy = np.sum(sing_vals[:k]**2)
    print(f"\nEnergy preserved: {kept_energy/total_energy*100:.2f}%")
74
75
def pca_from_scratch(X, n_components=2):
    """
    Implement PCA from scratch.

    Steps:
    1. Center the data (subtract mean)
    2. Compute covariance matrix
    3. Compute eigenvalues and eigenvectors
    4. Sort by eigenvalues (descending)
    5. Project data onto top k eigenvectors

    Args:
        X: array of shape (n_samples, n_features).
        n_components: number of principal components to keep.

    Returns:
        Tuple of:
        - projected data, shape (n_samples, n_components)
        - component matrix (eigenvectors as columns), shape
          (n_features, n_components)
        - explained-variance ratio for ALL components (length n_features)
        - per-feature mean used for centering
    """
    print("\n--- PCA From Scratch ---")

    # Step 1: Center the data
    mean = np.mean(X, axis=0)
    X_centered = X - mean
    print(f"Data shape: {X.shape}")
    print(f"Mean: {mean}")

    # Step 2: Compute covariance matrix (unbiased estimator, hence n-1)
    n_samples = X.shape[0]
    cov_matrix = (X_centered.T @ X_centered) / (n_samples - 1)
    print(f"\nCovariance matrix shape: {cov_matrix.shape}")
    print(f"Covariance matrix:\n{cov_matrix}")

    # Step 3: Eigendecomposition. Use eigh, not eig: the covariance matrix
    # is symmetric, and eigh guarantees real eigenvalues and orthonormal
    # eigenvectors, whereas eig may return spurious complex parts due to
    # floating-point noise.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Step 4: Sort by eigenvalues (descending); eigh returns ascending order
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    print(f"\nEigenvalues: {eigenvalues}")
    print(f"\nTop {n_components} eigenvectors (principal components):")
    print(eigenvectors[:, :n_components])

    # Step 5: Project data onto the leading eigenvectors
    X_pca = X_centered @ eigenvectors[:, :n_components]

    # Explained variance ratio across ALL components
    explained_variance = eigenvalues / np.sum(eigenvalues)
    explained_variance_cumsum = np.cumsum(explained_variance)

    print(f"\nExplained variance ratio: {explained_variance[:n_components]}")
    print(f"Cumulative explained variance: {explained_variance_cumsum[:n_components]}")

    return X_pca, eigenvectors[:, :n_components], explained_variance, mean
124
125
def pca_with_sklearn(X, n_components=2):
    """Run sklearn's PCA on X and report its diagnostics for comparison
    with the from-scratch implementation."""
    print("\n--- PCA with sklearn ---")

    model = SklearnPCA(n_components=n_components)
    transformed = model.fit_transform(X)

    ratios = model.explained_variance_ratio_
    print(f"Transformed shape: {transformed.shape}")
    print(f"Explained variance ratio: {ratios}")
    print(f"Cumulative variance: {np.cumsum(ratios)}")
    # Transpose so rows correspond to original features, like the
    # from-scratch eigenvector matrix.
    print(f"\nPrincipal components (shape {model.components_.shape}):")
    print(model.components_.T)

    return transformed, model
140
141
def demonstrate_pca_on_iris():
    """Load the Iris dataset and compare the from-scratch PCA against
    sklearn's implementation on it."""

    def banner(title):
        # Section header in the style used throughout this script.
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    banner("PCA ON IRIS DATASET")

    # 150 samples, 4 features, 3 classes
    bunch = load_iris()
    data = bunch.data
    labels = bunch.target
    names = bunch.feature_names

    print(f"\nIris dataset:")
    print(f"Samples: {data.shape[0]}, Features: {data.shape[1]}")
    print(f"Feature names: {names}")
    print(f"First 5 samples:\n{data[:5]}")

    banner("CUSTOM PCA")
    custom_proj, custom_components, custom_var, data_mean = pca_from_scratch(data, n_components=2)

    banner("SKLEARN PCA")
    sklearn_proj, fitted_model = pca_with_sklearn(data, n_components=2)

    print("\n--- Verification ---")
    print(f"Custom PCA result (first 5):\n{custom_proj[:5]}")
    print(f"Sklearn PCA result (first 5):\n{sklearn_proj[:5]}")

    # Component signs are arbitrary, so compare magnitudes only.
    gap = np.abs(custom_proj) - np.abs(sklearn_proj)
    print(f"\nMax absolute difference: {np.max(np.abs(gap)):.10f}")

    return sklearn_proj, labels, fitted_model, data, names
181
182
def visualize_pca_results(X_pca, y, pca_model, X_original, feature_names,
                          save_path='/opt/projects/01_Personal/03_Study/examples/Math_for_AI/pca_visualization.png'):
    """Visualize PCA results as three panels: 2D projection, explained
    variance per component, and component loadings.

    Args:
        X_pca: projected data, shape (n_samples, 2).
        y: class labels used to color the scatter plot.
        pca_model: fitted sklearn PCA (supplies explained_variance_ratio_
            and components_).
        X_original: original (unprojected) data. Currently unused; kept so
            existing callers keep working.
        feature_names: names of the original features (loading-plot labels).
        save_path: output path for the PNG. Defaults to the previously
            hard-coded location for backward compatibility; pass a relative
            path to save elsewhere.
    """
    print("\n" + "=" * 60)
    print("VISUALIZING PCA RESULTS")
    print("=" * 60)

    fig = plt.figure(figsize=(15, 5))

    # Plot 1: PCA projection (2D), colored by class
    ax1 = fig.add_subplot(131)
    scatter = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis',
                          edgecolors='k', s=50, alpha=0.7)
    ax1.set_xlabel('PC1')
    ax1.set_ylabel('PC2')
    ax1.set_title('PCA Projection (2D)')
    ax1.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax1, label='Class')

    # Plot 2: Individual and cumulative explained variance
    ax2 = fig.add_subplot(132)
    explained_var = pca_model.explained_variance_ratio_
    cumsum_var = np.cumsum(explained_var)

    x_pos = np.arange(len(explained_var))
    ax2.bar(x_pos, explained_var, alpha=0.7, label='Individual')
    ax2.plot(x_pos, cumsum_var, 'ro-', label='Cumulative')
    ax2.set_xlabel('Principal Component')
    ax2.set_ylabel('Explained Variance Ratio')
    ax2.set_title('Explained Variance by Component')
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels([f'PC{i+1}' for i in x_pos])
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Component loadings (how much each original feature
    # contributes to each principal component)
    ax3 = fig.add_subplot(133)
    components = pca_model.components_.T  # Transpose to get (features, components)

    x_pos = np.arange(len(feature_names))
    width = 0.35

    bars1 = ax3.bar(x_pos - width/2, components[:, 0], width,
                    label='PC1', alpha=0.7)
    bars2 = ax3.bar(x_pos + width/2, components[:, 1], width,
                    label='PC2', alpha=0.7)

    ax3.set_xlabel('Features')
    ax3.set_ylabel('Loading')
    ax3.set_title('Principal Component Loadings')
    ax3.set_xticks(x_pos)
    # Use just the first word of each feature name to keep labels short
    ax3.set_xticklabels([name.split()[0] for name in feature_names], rotation=45, ha='right')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')
    ax3.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    # Report the actual destination (the old message implied the current
    # directory even though the file went to an absolute path)
    print(f"Visualization saved to {save_path}")
    plt.close()
242
243
def svd_vs_pca():
    """Show the relationship between SVD and PCA.

    Demonstrates numerically that PCA via eigendecomposition of the
    covariance matrix agrees with PCA via SVD of the centered data:
    eigenvalue_i = s_i^2 / (n - 1), and the rows of V^T are the
    eigenvectors (up to sign).
    """
    print("\n" + "=" * 60)
    print("RELATIONSHIP BETWEEN SVD AND PCA")
    print("=" * 60)

    # Generate random data (fixed seed for reproducibility)
    np.random.seed(42)
    X = np.random.randn(100, 5)

    # Center the data — PCA is defined on mean-centered data
    X_centered = X - np.mean(X, axis=0)

    print(f"Data shape: {X.shape}")

    # Method 1: PCA via eigendecomposition of the covariance matrix.
    # Use eigh, not eig: the covariance matrix is symmetric, so eigh
    # guarantees real eigenvalues (ascending, which we reverse) and
    # orthonormal eigenvectors.
    cov = (X_centered.T @ X_centered) / (X.shape[0] - 1)
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    print("\n--- PCA via Eigendecomposition ---")
    print(f"Eigenvalues: {eigenvalues}")

    # Method 2: PCA via SVD of the centered data matrix
    U, s, Vt = np.linalg.svd(X_centered, full_matrices=False)

    # Singular values squared relate to eigenvalues
    eigenvalues_from_svd = (s**2) / (X.shape[0] - 1)

    print("\n--- PCA via SVD ---")
    print(f"Eigenvalues from SVD: {eigenvalues_from_svd}")

    # Rows of V^T are the principal components; signs are arbitrary,
    # so compare magnitudes
    print("\n--- Comparison ---")
    print(f"Eigenvalues match: {np.allclose(eigenvalues, eigenvalues_from_svd)}")
    print(f"Eigenvectors match (up to sign): {np.allclose(np.abs(eigenvectors), np.abs(Vt.T))}")

    print("\nKey insight:")
    print("- SVD of centered data X = U @ S @ V^T")
    print("- V^T contains principal components (eigenvectors of X^T X)")
    print("- Singular values s relate to eigenvalues: eigenvalue = s^2 / (n-1)")
287
288
if __name__ == "__main__":
    # Run the demos in order: raw SVD, PCA on Iris (with plots), then the
    # SVD-vs-PCA equivalence check.
    demonstrate_svd()

    print("\n")
    projected, targets, model, raw_data, names = demonstrate_pca_on_iris()

    visualize_pca_results(projected, targets, model, raw_data, names)

    svd_vs_pca()

    closing = "=" * 60
    print("\n" + closing)
    print("All demonstrations completed!")
    print(closing)