# 03_matrix_calculus_autograd.py
"""
Matrix Calculus and Automatic Differentiation

Demonstrates:
- Manual gradient computation for simple functions
- Jacobian and Hessian computation with NumPy
- PyTorch autograd for comparison
- Numerical gradient checking
- ML application: computing gradients of MSE loss

Dependencies: numpy, torch, matplotlib
"""
 13
 14import numpy as np
 15import torch
 16import matplotlib.pyplot as plt
 17
 18
 19def manual_gradients():
 20    """Compute gradients manually for simple functions"""
 21    print("=" * 60)
 22    print("MANUAL GRADIENT COMPUTATION")
 23    print("=" * 60)
 24
 25    # f(x) = x^2
 26    print("\n--- f(x) = x^2 ---")
 27    x = 3.0
 28    f_x = x**2
 29    df_dx = 2*x  # Derivative: f'(x) = 2x
 30
 31    print(f"x = {x}")
 32    print(f"f(x) = {f_x}")
 33    print(f"f'(x) = {df_dx}")
 34
 35    # Multivariate: f(x, y) = x^2 + 2xy + y^2
 36    print("\n--- f(x, y) = x^2 + 2xy + y^2 ---")
 37    x, y = 2.0, 3.0
 38    f_xy = x**2 + 2*x*y + y**2
 39
 40    # Partial derivatives
 41    df_dx = 2*x + 2*y  # ∂f/∂x = 2x + 2y
 42    df_dy = 2*x + 2*y  # ∂f/∂y = 2x + 2y
 43
 44    print(f"x = {x}, y = {y}")
 45    print(f"f(x, y) = {f_xy}")
 46    print(f"∂f/∂x = {df_dx}")
 47    print(f"∂f/∂y = {df_dy}")
 48    print(f"Gradient: ∇f = [{df_dx}, {df_dy}]")
 49
 50    # Vector function: f(x) = ||x||^2 = x^T x
 51    print("\n--- f(x) = ||x||^2 = x^T x ---")
 52    x = np.array([1.0, 2.0, 3.0])
 53    f_x = np.dot(x, x)
 54    grad_f = 2 * x  # Gradient: ∇f = 2x
 55
 56    print(f"x = {x}")
 57    print(f"f(x) = {f_x}")
 58    print(f"∇f = {grad_f}")
 59
 60
 61def compute_jacobian():
 62    """Compute Jacobian matrix for vector-valued functions"""
 63    print("\n" + "=" * 60)
 64    print("JACOBIAN MATRIX")
 65    print("=" * 60)
 66
 67    print("\nJacobian: For f: R^n → R^m, J is m×n matrix of partial derivatives")
 68    print("J[i,j] = ∂f_i/∂x_j")
 69
 70    # Example: f(x, y) = [x^2 + y, xy, y^2]
 71    # Input: R^2, Output: R^3
 72    print("\n--- Example: f(x, y) = [x^2 + y, xy, y^2] ---")
 73
 74    def f(x, y):
 75        return np.array([
 76            x**2 + y,
 77            x * y,
 78            y**2
 79        ])
 80
 81    # Analytical Jacobian
 82    def jacobian_analytical(x, y):
 83        return np.array([
 84            [2*x, 1],      # ∂f1/∂x, ∂f1/∂y
 85            [y, x],        # ∂f2/∂x, ∂f2/∂y
 86            [0, 2*y]       # ∂f3/∂x, ∂f3/∂y
 87        ])
 88
 89    x, y = 2.0, 3.0
 90    f_val = f(x, y)
 91    J = jacobian_analytical(x, y)
 92
 93    print(f"\nAt point (x, y) = ({x}, {y})")
 94    print(f"f(x, y) = {f_val}")
 95    print(f"\nJacobian (3×2):\n{J}")
 96
 97    # Numerical Jacobian (finite differences)
 98    def numerical_jacobian(func, x, y, h=1e-7):
 99        f0 = func(x, y)
100        m = len(f0)  # output dimension
101
102        J_num = np.zeros((m, 2))
103
104        # ∂f/∂x
105        f_plus_x = func(x + h, y)
106        J_num[:, 0] = (f_plus_x - f0) / h
107
108        # ∂f/∂y
109        f_plus_y = func(x, y + h)
110        J_num[:, 1] = (f_plus_y - f0) / h
111
112        return J_num
113
114    J_num = numerical_jacobian(f, x, y)
115    print(f"\nNumerical Jacobian:\n{J_num}")
116    print(f"Max difference: {np.max(np.abs(J - J_num)):.10f}")
117
118
def compute_hessian():
    """Compute the Hessian of a quadratic function and analyze its convexity.

    Uses f(x, y) = x^2 + 2xy + 3y^2, prints its (constant) 2×2 Hessian,
    checks symmetry (Schwarz's theorem), and classifies convexity from the
    Hessian's eigenvalue signs.

    Returns:
        None. Output is printed to stdout.
    """
    print("\n" + "=" * 60)
    print("HESSIAN MATRIX")
    print("=" * 60)

    print("\nHessian: For f: R^n → R, H is n×n matrix of second partial derivatives")
    print("H[i,j] = ∂²f/(∂x_i ∂x_j)")

    # Example: f(x, y) = x^2 + 2xy + 3y^2
    print("\n--- Example: f(x, y) = x^2 + 2xy + 3y^2 ---")

    def f(x, y):
        return x**2 + 2*x*y + 3*y**2

    # Analytical Hessian. For a quadratic the Hessian is constant, so the
    # (x, y) arguments are unused; they are kept for a uniform call signature.
    def hessian_analytical(x, y):
        return np.array([
            [2.0, 2.0],   # ∂²f/∂x², ∂²f/∂x∂y
            [2.0, 6.0]    # ∂²f/∂y∂x, ∂²f/∂y²
        ])

    x, y = 1.0, 2.0
    f_val = f(x, y)
    H = hessian_analytical(x, y)

    print(f"\nAt point (x, y) = ({x}, {y})")
    print(f"f(x, y) = {f_val}")
    print(f"\nHessian (2×2):\n{H}")

    # Check symmetry (Schwarz's theorem: mixed partials commute for smooth f)
    print(f"Symmetric: {np.allclose(H, H.T)}")

    # Eigenvalues determine definiteness, hence convexity
    eigenvalues = np.linalg.eigvals(H)
    print(f"\nEigenvalues: {eigenvalues}")

    if np.all(eigenvalues > 0):
        print("Hessian is positive definite → f is strictly convex")
    elif np.all(eigenvalues >= 0):
        print("Hessian is positive semidefinite → f is convex")
    else:
        print("Hessian has negative eigenvalues → f is not convex")


def pytorch_autograd():
    """Demonstrate PyTorch automatic differentiation on scalar, vector, and matrix cases.

    Computes df/dx for f(x) = x^2 + 2x + 1, the gradient of ||x||^2, and
    ∂f/∂W for f(W) = ||Wx||^2, using `backward()` on each scalar output.

    Returns:
        None. Output is printed to stdout.
    """
    print("\n" + "=" * 60)
    print("PYTORCH AUTOMATIC DIFFERENTIATION")
    print("=" * 60)

    # Scalar function
    print("\n--- Scalar Function: f(x) = x^2 + 2x + 1 ---")
    x = torch.tensor(3.0, requires_grad=True)
    f = x**2 + 2*x + 1

    print(f"x = {x.item()}")
    print(f"f(x) = {f.item()}")

    # Compute gradient: backward() populates x.grad with df/dx
    f.backward()
    print(f"df/dx = {x.grad.item()}")
    print(f"Expected: 2*3 + 2 = {2*3 + 2}")

    # Vector function
    print("\n--- Vector Function ---")
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    f = torch.sum(x**2)  # f(x) = ||x||^2

    print(f"x = {x.detach().numpy()}")
    print(f"f(x) = {f.item()}")

    f.backward()
    print(f"∇f = {x.grad.numpy()}")
    print(f"Expected: 2*x = {2*x.detach().numpy()}")

    # Matrix operations: gradient of a scalar loss with respect to a matrix
    print("\n--- Matrix Operations ---")
    W = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
    x = torch.tensor([1.0, -1.0])

    # f(W) = ||Wx||^2
    y = W @ x
    f = torch.sum(y**2)

    print(f"W =\n{W.detach().numpy()}")
    print(f"x = {x.numpy()}")
    print(f"y = Wx = {y.detach().numpy()}")
    print(f"f = ||Wx||^2 = {f.item()}")

    f.backward()
    print(f"\n∂f/∂W =\n{W.grad.numpy()}")


def numerical_gradient_checking():
    """Verify an analytical gradient against a central-difference approximation.

    Uses f(x) = Σ (x_i^3 - 2x_i^2 + x_i), whose per-component gradient is
    3x^2 - 4x + 1, and reports whether the two gradients agree within a
    relative tolerance of 1e-4.

    Returns:
        None. Output is printed to stdout.
    """
    print("\n" + "=" * 60)
    print("NUMERICAL GRADIENT CHECKING")
    print("=" * 60)

    print("\nFinite difference approximation:")
    print("f'(x) ≈ [f(x + h) - f(x - h)] / (2h)")

    def f(x):
        return np.sum(x**3 - 2*x**2 + x)

    def grad_analytical(x):
        # d/dx_i of (x_i^3 - 2x_i^2 + x_i), applied elementwise
        return 3*x**2 - 4*x + 1

    def grad_numerical(x, h=1e-5):
        # Perturb one coordinate at a time; central difference has O(h^2) error
        grad = np.zeros_like(x)
        for i in range(len(x)):
            x_plus = x.copy()
            x_plus[i] += h
            x_minus = x.copy()
            x_minus[i] -= h

            grad[i] = (f(x_plus) - f(x_minus)) / (2*h)
        return grad

    x = np.array([1.0, 2.0, 3.0])

    grad_ana = grad_analytical(x)
    grad_num = grad_numerical(x)

    print(f"\nAt x = {x}")
    print(f"f(x) = {f(x)}")
    print(f"Analytical gradient: {grad_ana}")
    print(f"Numerical gradient:  {grad_num}")
    print(f"Max difference: {np.max(np.abs(grad_ana - grad_num)):.10f}")

    if np.allclose(grad_ana, grad_num, rtol=1e-4):
        print("✓ Gradient check PASSED")
    else:
        print("✗ Gradient check FAILED")


def mse_loss_gradients():
    """Compute MSE-loss gradients for linear regression by hand and via autograd.

    Forward pass: y_pred = XW + b, loss = mean((y_pred - y_true)^2).
    Backward pass (manual): dL/dW = (2/n) X^T (y_pred - y_true),
    dL/db = (2/n) Σ (y_pred - y_true). The same quantities are then
    recomputed with PyTorch autograd and the two results are cross-checked.

    Returns:
        None. Output is printed to stdout.
    """
    print("\n" + "=" * 60)
    print("ML APPLICATION: MSE LOSS GRADIENTS")
    print("=" * 60)

    # Linear regression: y_pred = Wx + b
    # Loss: L = (1/n) Σ (y_pred - y_true)^2

    print("\n--- Manual Computation ---")
    # Data: 4 samples, 2 features each
    X = np.array([[1.0, 2.0],
                  [2.0, 3.0],
                  [3.0, 4.0],
                  [4.0, 5.0]])
    y_true = np.array([3.0, 5.0, 7.0, 9.0])

    # Parameters
    W = np.array([1.0, 0.5])
    b = 0.5

    n = len(y_true)

    # Forward pass
    y_pred = X @ W + b
    loss = np.mean((y_pred - y_true)**2)

    print(f"X shape: {X.shape}")
    print(f"y_true: {y_true}")
    print(f"W: {W}, b: {b}")
    print(f"y_pred: {y_pred}")
    print(f"MSE Loss: {loss:.4f}")

    # Backward pass (gradients)
    # dL/dW = (2/n) X^T (y_pred - y_true)
    # dL/db = (2/n) Σ (y_pred - y_true)

    error = y_pred - y_true
    dL_dW = (2.0 / n) * X.T @ error
    dL_db = (2.0 / n) * np.sum(error)

    print("\nGradients (manual):")
    print(f"dL/dW = {dL_dW}")
    print(f"dL/db = {dL_db:.4f}")

    # PyTorch computation of the same gradients for verification
    print("\n--- PyTorch Autograd ---")
    X_torch = torch.tensor(X, dtype=torch.float32)
    y_true_torch = torch.tensor(y_true, dtype=torch.float32)
    W_torch = torch.tensor(W, dtype=torch.float32, requires_grad=True)
    b_torch = torch.tensor(b, dtype=torch.float32, requires_grad=True)

    y_pred_torch = X_torch @ W_torch + b_torch
    loss_torch = torch.mean((y_pred_torch - y_true_torch)**2)

    loss_torch.backward()

    print(f"Loss: {loss_torch.item():.4f}")
    print(f"dL/dW = {W_torch.grad.numpy()}")
    print(f"dL/db = {b_torch.grad.item():.4f}")

    print("\n--- Verification ---")
    print(f"W gradients match: {np.allclose(dL_dW, W_torch.grad.numpy())}")
    print(f"b gradients match: {np.isclose(dL_db, b_torch.grad.item())}")


def visualize_gradient_descent(save_path='/opt/projects/01_Personal/03_Study/examples/Math_for_AI/gradient_descent.png'):
    """Run gradient descent on f(x, y) = x^2 + 4y^2 and save a contour plot.

    Performs 20 steps of fixed-step gradient descent from (3, 2) with
    learning rate 0.1, then renders the trajectory over a contour plot.

    Args:
        save_path: Destination for the PNG figure. Parameterized because the
            previous hard-coded absolute path was environment-specific; the
            default preserves the original behavior.

    Returns:
        None. The figure is written to ``save_path`` and closed.
    """
    print("\n" + "=" * 60)
    print("GRADIENT DESCENT VISUALIZATION")
    print("=" * 60)

    # Function: f(x, y) = x^2 + 4y^2 (elliptic paraboloid)
    def f(x, y):
        return x**2 + 4*y**2

    def grad_f(x, y):
        # ∇f = [2x, 8y]
        return np.array([2*x, 8*y])

    # Starting point and hyperparameters
    x0 = np.array([3.0, 2.0])
    learning_rate = 0.1
    n_iterations = 20

    # Gradient descent: x ← x - η ∇f(x)
    trajectory = [x0.copy()]
    x = x0.copy()

    for i in range(n_iterations):
        grad = grad_f(x[0], x[1])
        x = x - learning_rate * grad
        trajectory.append(x.copy())

    trajectory = np.array(trajectory)

    print(f"Starting point: {x0}")
    print(f"Final point: {trajectory[-1]}")
    print(f"Final loss: {f(trajectory[-1][0], trajectory[-1][1]):.6f}")

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 8))

    # Create meshgrid for contour plot
    x_range = np.linspace(-4, 4, 100)
    y_range = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x_range, y_range)
    Z = f(X, Y)

    # Contour plot
    contour = ax.contour(X, Y, Z, levels=20, cmap='viridis', alpha=0.6)
    ax.clabel(contour, inline=True, fontsize=8)

    # Gradient descent trajectory
    ax.plot(trajectory[:, 0], trajectory[:, 1], 'ro-', linewidth=2,
            markersize=6, label='Gradient Descent', alpha=0.7)
    ax.plot(trajectory[0, 0], trajectory[0, 1], 'go', markersize=12,
            label='Start', zorder=5)
    ax.plot(trajectory[-1, 0], trajectory[-1, 1], 'r*', markersize=15,
            label='End', zorder=5)

    # Arrows for every third step of the trajectory
    for i in range(0, len(trajectory)-1, 3):
        dx = trajectory[i+1, 0] - trajectory[i, 0]
        dy = trajectory[i+1, 1] - trajectory[i, 1]
        ax.arrow(trajectory[i, 0], trajectory[i, 1], dx, dy,
                head_width=0.15, head_length=0.1, fc='red', ec='red', alpha=0.5)

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Gradient Descent on f(x,y) = x² + 4y²')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal')

    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    # Report the actual destination (the old message claimed a relative file)
    print(f"\nVisualization saved to {save_path}")
    plt.close()


if __name__ == "__main__":
    # Run all demonstrations in order
    manual_gradients()
    compute_jacobian()
    compute_hessian()
    pytorch_autograd()
    numerical_gradient_checking()
    mse_loss_gradients()
    visualize_gradient_descent()

    print("\n" + "=" * 60)
    print("All demonstrations completed!")
    print("=" * 60)