# 04_norms_regularization.py
"""
Norms, Distance Metrics, and Regularization

Demonstrates:
- Lp norms visualization (L1, L2, L∞)
- Distance metrics (Euclidean, Manhattan, Cosine, Mahalanobis)
- L1 vs L2 regularization effects on linear regression
- Sparsity demonstration
- ML applications: feature scaling, regularization

Dependencies: numpy, matplotlib, sklearn
"""
 13
 14import numpy as np
 15import matplotlib.pyplot as plt
 16from sklearn.linear_model import Ridge, Lasso, LinearRegression
 17from sklearn.preprocessing import StandardScaler
 18from sklearn.metrics.pairwise import cosine_similarity
 19
 20
 21def demonstrate_lp_norms():
 22    """Demonstrate L1, L2, and L∞ norms"""
 23    print("=" * 60)
 24    print("Lp NORMS")
 25    print("=" * 60)
 26
 27    print("\nLp norm: ||x||_p = (Σ|x_i|^p)^(1/p)")
 28
 29    x = np.array([3, -4, 0, 2])
 30
 31    # L1 norm (Manhattan): sum of absolute values
 32    l1_norm = np.linalg.norm(x, ord=1)
 33    l1_manual = np.sum(np.abs(x))
 34
 35    # L2 norm (Euclidean): square root of sum of squares
 36    l2_norm = np.linalg.norm(x, ord=2)
 37    l2_manual = np.sqrt(np.sum(x**2))
 38
 39    # L∞ norm (Maximum): maximum absolute value
 40    linf_norm = np.linalg.norm(x, ord=np.inf)
 41    linf_manual = np.max(np.abs(x))
 42
 43    print(f"\nVector x = {x}")
 44    print(f"\nL1 norm (Manhattan):  ||x||_1 = {l1_norm:.4f} (manual: {l1_manual:.4f})")
 45    print(f"L2 norm (Euclidean):  ||x||_2 = {l2_norm:.4f} (manual: {l2_manual:.4f})")
 46    print(f"L∞ norm (Maximum):    ||x||_∞ = {linf_norm:.4f} (manual: {linf_manual:.4f})")
 47
 48    # Properties
 49    print("\n--- Norm Properties ---")
 50    print("1. Non-negativity: ||x|| ≥ 0")
 51    print("2. Definiteness: ||x|| = 0 iff x = 0")
 52    print("3. Homogeneity: ||αx|| = |α| · ||x||")
 53    print("4. Triangle inequality: ||x + y|| ≤ ||x|| + ||y||")
 54
 55    # Verify triangle inequality
 56    y = np.array([1, 2, -1, 3])
 57    lhs = np.linalg.norm(x + y, ord=2)
 58    rhs = np.linalg.norm(x, ord=2) + np.linalg.norm(y, ord=2)
 59    print(f"\nTriangle inequality verification (L2):")
 60    print(f"||x + y|| = {lhs:.4f}")
 61    print(f"||x|| + ||y|| = {rhs:.4f}")
 62    print(f"||x + y|| ≤ ||x|| + ||y||: {lhs <= rhs}")
 63
 64
 65def visualize_unit_balls():
 66    """Visualize unit balls for different norms"""
 67    print("\n" + "=" * 60)
 68    print("VISUALIZING UNIT BALLS")
 69    print("=" * 60)
 70
 71    print("\nUnit ball: {x : ||x||_p ≤ 1}")
 72
 73    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
 74
 75    # Generate points
 76    theta = np.linspace(0, 2*np.pi, 1000)
 77
 78    # L∞ norm: square
 79    ax = axes[0]
 80    square_x = np.array([1, 1, -1, -1, 1])
 81    square_y = np.array([1, -1, -1, 1, 1])
 82    ax.plot(square_x, square_y, 'b-', linewidth=2)
 83    ax.fill(square_x, square_y, alpha=0.3)
 84    ax.set_xlim(-1.5, 1.5)
 85    ax.set_ylim(-1.5, 1.5)
 86    ax.set_aspect('equal')
 87    ax.grid(True, alpha=0.3)
 88    ax.set_title('L∞ Norm (||x||∞ ≤ 1)')
 89    ax.set_xlabel('x₁')
 90    ax.set_ylabel('x₂')
 91
 92    # L2 norm: circle
 93    ax = axes[1]
 94    circle_x = np.cos(theta)
 95    circle_y = np.sin(theta)
 96    ax.plot(circle_x, circle_y, 'g-', linewidth=2)
 97    ax.fill(circle_x, circle_y, alpha=0.3)
 98    ax.set_xlim(-1.5, 1.5)
 99    ax.set_ylim(-1.5, 1.5)
100    ax.set_aspect('equal')
101    ax.grid(True, alpha=0.3)
102    ax.set_title('L2 Norm (||x||₂ ≤ 1)')
103    ax.set_xlabel('x₁')
104    ax.set_ylabel('x₂')
105
106    # L1 norm: diamond
107    ax = axes[2]
108    diamond_x = np.array([1, 0, -1, 0, 1])
109    diamond_y = np.array([0, 1, 0, -1, 0])
110    ax.plot(diamond_x, diamond_y, 'r-', linewidth=2)
111    ax.fill(diamond_x, diamond_y, alpha=0.3)
112    ax.set_xlim(-1.5, 1.5)
113    ax.set_ylim(-1.5, 1.5)
114    ax.set_aspect('equal')
115    ax.grid(True, alpha=0.3)
116    ax.set_title('L1 Norm (||x||₁ ≤ 1)')
117    ax.set_xlabel('x₁')
118    ax.set_ylabel('x₂')
119
120    plt.tight_layout()
121    plt.savefig('/opt/projects/01_Personal/03_Study/examples/Math_for_AI/unit_balls.png', dpi=150)
122    print("Unit ball visualization saved to unit_balls.png")
123    plt.close()
124
125
def distance_metrics():
    """Compute and print five distance metrics between sample vectors.

    Covers Euclidean (L2), Manhattan (L1), Chebyshev (L∞), cosine
    similarity/distance, and Mahalanobis distance (using a covariance
    matrix estimated from synthetic correlated data). Returns None;
    output goes to stdout.
    """
    print("\n" + "=" * 60)
    print("DISTANCE METRICS")
    print("=" * 60)

    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])

    print(f"Vector x: {x}")
    print(f"Vector y: {y}")

    # Euclidean distance (L2)
    euclidean = np.linalg.norm(x - y, ord=2)
    print(f"\n1. Euclidean distance (L2): {euclidean:.4f}")
    print("   d(x, y) = ||x - y||₂ = sqrt(Σ(x_i - y_i)²)")

    # Manhattan distance (L1)
    manhattan = np.linalg.norm(x - y, ord=1)
    print(f"\n2. Manhattan distance (L1): {manhattan:.4f}")
    print("   d(x, y) = ||x - y||₁ = Σ|x_i - y_i|")

    # Chebyshev distance (L∞)
    chebyshev = np.linalg.norm(x - y, ord=np.inf)
    print(f"\n3. Chebyshev distance (L∞): {chebyshev:.4f}")
    print("   d(x, y) = ||x - y||∞ = max|x_i - y_i|")

    # Cosine similarity (angle-based); cosine distance = 1 - similarity
    cos_sim = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    cos_dist = 1 - cos_sim
    print(f"\n4. Cosine similarity: {cos_sim:.4f}")
    print("   cos(θ) = (x·y) / (||x|| ||y||)")
    print(f"   Cosine distance: {cos_dist:.4f}")

    # Mahalanobis distance: Euclidean distance after whitening by the
    # inverse covariance, so correlated directions are down-weighted
    print("\n5. Mahalanobis distance:")
    print("   Accounts for covariance structure")

    # Generate correlated 2D data to estimate a covariance matrix from
    np.random.seed(42)
    mean = [0, 0]
    cov = [[2, 1], [1, 2]]  # Covariance matrix
    data = np.random.multivariate_normal(mean, cov, size=100)

    # Compute the sample covariance and its inverse
    cov_matrix = np.cov(data.T)
    cov_inv = np.linalg.inv(cov_matrix)

    # Two points to compare
    p1 = np.array([1, 1])
    p2 = np.array([2, 2])

    # d_M(p1, p2) = sqrt((p1 - p2)^T Σ⁻¹ (p1 - p2))
    diff = p1 - p2
    mahal_dist = np.sqrt(diff.T @ cov_inv @ diff)

    print(f"   Point 1: {p1}")
    print(f"   Point 2: {p2}")
    print(f"   Mahalanobis distance: {mahal_dist:.4f}")
    print(f"   Euclidean distance:   {np.linalg.norm(p1 - p2):.4f}")
def regularization_comparison():
    """Compare plain, L2 (Ridge), and L1 (Lasso) regularized regression.

    Builds a synthetic dataset with 50 features of which only the first
    5 have non-zero true coefficients, fits all three models, and prints
    sparsity and norm statistics of the learned coefficients.

    Returns:
        tuple: (X, y, lr, ridge, lasso, true_coef) — the data, the three
        fitted models, and the ground-truth coefficient vector, for use
        by visualize_regularization().
    """
    print("\n" + "=" * 60)
    print("L1 vs L2 REGULARIZATION")
    print("=" * 60)

    # Generate synthetic data with many features
    np.random.seed(42)
    n_samples = 100
    n_features = 50

    # Only the first 5 features are truly relevant
    X = np.random.randn(n_samples, n_features)
    true_coef = np.zeros(n_features)
    true_coef[:5] = [3.0, -2.0, 1.5, -1.0, 2.5]

    y = X @ true_coef + np.random.randn(n_samples) * 0.5

    print(f"\nDataset:")
    print(f"Samples: {n_samples}, Features: {n_features}")
    print(f"True non-zero coefficients: 5")

    # Linear regression (no regularization)
    lr = LinearRegression()
    lr.fit(X, y)

    # Ridge (L2 regularization): penalizes ||w||₂²
    ridge = Ridge(alpha=1.0)
    ridge.fit(X, y)

    # Lasso (L1 regularization): penalizes ||w||₁, induces sparsity
    lasso = Lasso(alpha=0.1)
    lasso.fit(X, y)

    # Count coefficients above a small threshold as "non-zero"
    print("\n--- Coefficient Analysis ---")
    print(f"Linear Regression - Non-zero coefs: {np.sum(np.abs(lr.coef_) > 0.01)}")
    print(f"Ridge (L2) - Non-zero coefs: {np.sum(np.abs(ridge.coef_) > 0.01)}")
    print(f"Lasso (L1) - Non-zero coefs: {np.sum(np.abs(lasso.coef_) > 0.01)}")

    print(f"\nL2 norm of coefficients:")
    print(f"Linear Regression: {np.linalg.norm(lr.coef_, ord=2):.4f}")
    print(f"Ridge (L2):        {np.linalg.norm(ridge.coef_, ord=2):.4f}")
    print(f"Lasso (L1):        {np.linalg.norm(lasso.coef_, ord=2):.4f}")

    print(f"\nL1 norm of coefficients:")
    print(f"Linear Regression: {np.linalg.norm(lr.coef_, ord=1):.4f}")
    print(f"Ridge (L2):        {np.linalg.norm(ridge.coef_, ord=1):.4f}")
    print(f"Lasso (L1):        {np.linalg.norm(lasso.coef_, ord=1):.4f}")

    print("\nKey insights:")
    print("- L1 (Lasso) produces sparse solutions (many exact zeros)")
    print("- L2 (Ridge) shrinks coefficients but rarely to exactly zero")
    print("- L1 performs feature selection, L2 performs feature shrinkage")

    return X, y, lr, ridge, lasso, true_coef
def visualize_regularization(X, y, lr, ridge, lasso, true_coef):
    """Visualize regularization effects with a 2x2 panel of plots.

    Panels: full coefficient comparison, zoomed bar chart of the first
    10 coefficients, a sparsity bar chart, and the Lasso regularization
    path over a log-spaced range of alphas. Saves the figure as
    ``regularization_comparison.png`` in the current working directory.

    Args:
        X, y: the design matrix and targets used to fit the models.
        lr, ridge, lasso: fitted LinearRegression/Ridge/Lasso models.
        true_coef: ground-truth coefficient vector.
    """
    print("\n" + "=" * 60)
    print("VISUALIZING REGULARIZATION EFFECTS")
    print("=" * 60)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Plot 1: all coefficient values, true vs. each model
    ax = axes[0, 0]
    x_pos = np.arange(len(true_coef))

    ax.plot(x_pos, true_coef, 'ko-', label='True', linewidth=2, markersize=4)
    ax.plot(x_pos, lr.coef_, 'b.-', label='Linear Reg', alpha=0.7, markersize=3)
    ax.plot(x_pos, ridge.coef_, 'g.-', label='Ridge (L2)', alpha=0.7, markersize=3)
    ax.plot(x_pos, lasso.coef_, 'r.-', label='Lasso (L1)', alpha=0.7, markersize=3)

    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Coefficient Value')
    ax.set_title('Coefficient Values Comparison')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

    # Plot 2: first 10 coefficients as grouped bars (zoomed view)
    ax = axes[0, 1]
    n_show = 10
    x_pos_zoom = np.arange(n_show)

    width = 0.2
    ax.bar(x_pos_zoom - 1.5*width, true_coef[:n_show], width, label='True', alpha=0.8)
    ax.bar(x_pos_zoom - 0.5*width, lr.coef_[:n_show], width, label='Linear Reg', alpha=0.8)
    ax.bar(x_pos_zoom + 0.5*width, ridge.coef_[:n_show], width, label='Ridge (L2)', alpha=0.8)
    ax.bar(x_pos_zoom + 1.5*width, lasso.coef_[:n_show], width, label='Lasso (L1)', alpha=0.8)

    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Coefficient Value')
    ax.set_title(f'First {n_show} Coefficients (Zoomed)')
    ax.set_xticks(x_pos_zoom)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

    # Plot 3: number of non-zero coefficients per model
    ax = axes[1, 0]

    threshold = 0.01
    sparsity_data = np.array([
        np.sum(np.abs(true_coef) > threshold),
        np.sum(np.abs(lr.coef_) > threshold),
        np.sum(np.abs(ridge.coef_) > threshold),
        np.sum(np.abs(lasso.coef_) > threshold)
    ])

    bars = ax.bar(['True', 'Linear Reg', 'Ridge (L2)', 'Lasso (L1)'],
                   sparsity_data, color=['black', 'blue', 'green', 'red'], alpha=0.7)

    ax.set_ylabel('Number of Non-Zero Coefficients')
    ax.set_title(f'Sparsity Comparison (threshold = {threshold})')
    ax.grid(True, alpha=0.3, axis='y')

    # Annotate each bar with its count
    for bar, value in zip(bars, sparsity_data):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(value)}', ha='center', va='bottom')

    # Plot 4: regularization paths (coefficients as alpha varies)
    ax = axes[1, 1]

    alphas = np.logspace(-3, 2, 50)
    coefs_ridge = []
    coefs_lasso = []

    for alpha in alphas:
        ridge_temp = Ridge(alpha=alpha)
        ridge_temp.fit(X, y)
        coefs_ridge.append(ridge_temp.coef_)

        # Higher max_iter so small alphas converge without warnings
        lasso_temp = Lasso(alpha=alpha, max_iter=10000)
        lasso_temp.fit(X, y)
        coefs_lasso.append(lasso_temp.coef_)

    coefs_ridge = np.array(coefs_ridge)
    coefs_lasso = np.array(coefs_lasso)

    # Plot the path of the first 5 (truly relevant) features only
    for i in range(5):
        ax.plot(alphas, coefs_lasso[:, i], '-', linewidth=2, label=f'Feature {i}')

    ax.set_xscale('log')
    ax.set_xlabel('Alpha (Regularization Strength)')
    ax.set_ylabel('Coefficient Value')
    ax.set_title('Lasso Regularization Path (First 5 Features)')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)

    plt.tight_layout()
    # Save relative to the working directory instead of a machine-specific
    # absolute path, so the script is portable and matches the message below.
    plt.savefig('regularization_comparison.png', dpi=150)
    print("Regularization visualization saved to regularization_comparison.png")
    plt.close()
def demonstrate_sparsity():
    """Demonstrate the sparsity-inducing property of the L1 norm.

    Fits plain, Ridge, and Lasso regression on a 2-feature dataset where
    only the first feature is relevant, and prints the coefficients to
    show that Lasso drives the irrelevant coefficient to (near) zero.
    Returns None; output goes to stdout.
    """
    print("\n" + "=" * 60)
    print("SPARSITY WITH L1 REGULARIZATION")
    print("=" * 60)

    print("\nWhy L1 induces sparsity:")
    print("- L1 penalty: λΣ|w_i| has sharp corners at axes")
    print("- L2 penalty: λΣw_i² is smooth everywhere")
    print("- Optimization with L1 tends to hit corners (exact zeros)")

    # Simple 2D example
    print("\n--- 2D Example ---")

    # Generate data where only x1 is relevant
    np.random.seed(42)
    n = 100
    x1 = np.random.randn(n)
    x2 = np.random.randn(n)
    y = 3*x1 + np.random.randn(n) * 0.5  # Only x1 is relevant

    X = np.column_stack([x1, x2])

    # Fit all three models on the same data
    lr = LinearRegression().fit(X, y)
    ridge = Ridge(alpha=1.0).fit(X, y)
    lasso = Lasso(alpha=0.1).fit(X, y)

    print(f"\nTrue relationship: y = 3*x1 + noise")
    print(f"\nLinear Regression coef: {lr.coef_}")
    print(f"Ridge (L2) coef:        {ridge.coef_}")
    print(f"Lasso (L1) coef:        {lasso.coef_}")

    print(f"\nLasso correctly identifies x2 as irrelevant!")
    print(f"x2 coefficient ≈ {lasso.coef_[1]:.6f} (effectively zero)")
if __name__ == "__main__":
    # Run all demonstrations in order; the regularization comparison
    # returns its data and fitted models for the visualization step.
    demonstrate_lp_norms()
    visualize_unit_balls()
    distance_metrics()

    X, y, lr, ridge, lasso, true_coef = regularization_comparison()
    visualize_regularization(X, y, lr, ridge, lasso, true_coef)

    demonstrate_sparsity()

    print("\n" + "=" * 60)
    print("All demonstrations completed!")
    print("=" * 60)