1"""
2Matrix Calculus and Automatic Differentiation
3
4Demonstrates:
5- Manual gradient computation for simple functions
6- Jacobian and Hessian computation with NumPy
7- PyTorch autograd for comparison
8- Numerical gradient checking
9- ML application: computing gradients of MSE loss
10
11Dependencies: numpy, torch, matplotlib
12"""
13
14import numpy as np
15import torch
16import matplotlib.pyplot as plt
17
18
def manual_gradients():
    """Walk through hand-derived gradients for three simple functions.

    Covers a scalar function, a two-variable function, and a vector
    function, printing the analytic derivative at a sample point each time.
    """
    banner = "=" * 60
    print(banner)
    print("MANUAL GRADIENT COMPUTATION")
    print(banner)

    # --- Scalar case: f(x) = x^2, so f'(x) = 2x ---
    print("\n--- f(x) = x^2 ---")
    point = 3.0
    value = point**2
    slope = 2 * point

    print(f"x = {point}")
    print(f"f(x) = {value}")
    print(f"f'(x) = {slope}")

    # --- Two variables: f(x, y) = x^2 + 2xy + y^2 = (x + y)^2 ---
    print("\n--- f(x, y) = x^2 + 2xy + y^2 ---")
    a, b = 2.0, 3.0
    value = a**2 + 2 * a * b + b**2

    # Both partials equal 2(x + y) because f is symmetric in x and y.
    partial_x = 2 * a + 2 * b
    partial_y = 2 * a + 2 * b

    print(f"x = {a}, y = {b}")
    print(f"f(x, y) = {value}")
    print(f"∂f/∂x = {partial_x}")
    print(f"∂f/∂y = {partial_y}")
    print(f"Gradient: ∇f = [{partial_x}, {partial_y}]")

    # --- Vector input: f(x) = x^T x, with gradient ∇f = 2x ---
    print("\n--- f(x) = ||x||^2 = x^T x ---")
    vec = np.array([1.0, 2.0, 3.0])
    value = np.dot(vec, vec)
    gradient = 2 * vec

    print(f"x = {vec}")
    print(f"f(x) = {value}")
    print(f"∇f = {gradient}")
59
60
def compute_jacobian():
    """Illustrate the Jacobian of a vector-valued map, both analytically
    and via forward finite differences, then compare the two.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("JACOBIAN MATRIX")
    print(banner)

    print("\nJacobian: For f: R^n → R^m, J is m×n matrix of partial derivatives")
    print("J[i,j] = ∂f_i/∂x_j")

    # Example map from R^2 into R^3.
    print("\n--- Example: f(x, y) = [x^2 + y, xy, y^2] ---")

    def vector_map(u, v):
        return np.array([u**2 + v, u * v, v**2])

    def exact_jacobian(u, v):
        # Each row is the gradient of one output component.
        return np.array([
            [2 * u, 1],      # ∂f1/∂x, ∂f1/∂y
            [v, u],          # ∂f2/∂x, ∂f2/∂y
            [0, 2 * v],      # ∂f3/∂x, ∂f3/∂y
        ])

    u, v = 2.0, 3.0
    outputs = vector_map(u, v)
    J_exact = exact_jacobian(u, v)

    print(f"\nAt point (x, y) = ({u}, {v})")
    print(f"f(x, y) = {outputs}")
    print(f"\nJacobian (3×2):\n{J_exact}")

    def fd_jacobian(func, u, v, h=1e-7):
        """Forward-difference approximation: one column per input variable."""
        base = func(u, v)
        approx = np.zeros((len(base), 2))
        approx[:, 0] = (func(u + h, v) - base) / h   # ∂f/∂x column
        approx[:, 1] = (func(u, v + h) - base) / h   # ∂f/∂y column
        return approx

    J_approx = fd_jacobian(vector_map, u, v)
    print(f"\nNumerical Jacobian:\n{J_approx}")
    print(f"Max difference: {np.max(np.abs(J_exact - J_approx)):.10f}")
118
def compute_hessian():
    """Show the Hessian of a quadratic form and use its eigenvalues
    to classify the function's convexity.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("HESSIAN MATRIX")
    print(banner)

    print("\nHessian: For f: R^n → R, H is n×n matrix of second partial derivatives")
    print("H[i,j] = ∂²f/(∂x_i ∂x_j)")

    print("\n--- Example: f(x, y) = x^2 + 2xy + 3y^2 ---")

    def quadratic(u, v):
        return u**2 + 2 * u * v + 3 * v**2

    def exact_hessian(u, v):
        # Constant for a quadratic: every second partial is a fixed number.
        return np.array([
            [2, 2],   # ∂²f/∂x², ∂²f/∂x∂y
            [2, 6],   # ∂²f/∂y∂x, ∂²f/∂y²
        ])

    u, v = 1.0, 2.0
    f_val = quadratic(u, v)
    H = exact_hessian(u, v)

    print(f"\nAt point (x, y) = ({u}, {v})")
    print(f"f(x, y) = {f_val}")
    print(f"\nHessian (2×2):\n{H}")

    # Mixed partials agree (Schwarz's theorem), so H must be symmetric.
    print(f"Symmetric: {np.allclose(H, H.T)}")

    # Definiteness of H determines the convexity class of f.
    spectrum = np.linalg.eigvals(H)
    print(f"\nEigenvalues: {spectrum}")

    if np.all(spectrum > 0):
        print("Hessian is positive definite → f is strictly convex")
    elif np.all(spectrum >= 0):
        print("Hessian is positive semidefinite → f is convex")
    else:
        print("Hessian has negative eigenvalues → f is not convex")
162
163
def pytorch_autograd():
    """Compare hand-derived gradients with torch.autograd for a scalar
    function, a vector norm, and a matrix-valued parameter.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PYTORCH AUTOMATIC DIFFERENTIATION")
    print(banner)

    # Scalar: f(x) = x^2 + 2x + 1, so f'(x) = 2x + 2.
    print("\n--- Scalar Function: f(x) = x^2 + 2x + 1 ---")
    scalar = torch.tensor(3.0, requires_grad=True)
    out = scalar**2 + 2*scalar + 1

    print(f"x = {scalar.item()}")
    print(f"f(x) = {out.item()}")

    out.backward()
    print(f"df/dx = {scalar.grad.item()}")
    print(f"Expected: 2*3 + 2 = {2*3 + 2}")

    # Vector: f(x) = ||x||^2, gradient 2x.
    print("\n--- Vector Function ---")
    vec = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    norm_sq = torch.sum(vec**2)

    print(f"x = {vec.detach().numpy()}")
    print(f"f(x) = {norm_sq.item()}")

    norm_sq.backward()
    print(f"∇f = {vec.grad.numpy()}")
    print(f"Expected: 2*x = {2*vec.detach().numpy()}")

    # Matrix parameter: f(W) = ||Wx||^2 with x held fixed.
    print("\n--- Matrix Operations ---")
    weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
    inp = torch.tensor([1.0, -1.0])

    projected = weight @ inp
    objective = torch.sum(projected**2)

    print(f"W =\n{weight.detach().numpy()}")
    print(f"x = {inp.numpy()}")
    print(f"y = Wx = {projected.detach().numpy()}")
    print(f"f = ||Wx||^2 = {objective.item()}")

    objective.backward()
    print(f"\n∂f/∂W =\n{weight.grad.numpy()}")
211
212
def numerical_gradient_checking():
    """Validate an analytical gradient against a central-difference
    approximation and report whether the two agree.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("NUMERICAL GRADIENT CHECKING")
    print(banner)

    print("\nFinite difference approximation:")
    print("f'(x) ≈ [f(x + h) - f(x - h)] / (2h)")

    def objective(v):
        # f(x) = Σ (x_i^3 - 2 x_i^2 + x_i)
        return np.sum(v**3 - 2*v**2 + v)

    def exact_gradient(v):
        # Element-wise derivative: 3x^2 - 4x + 1.
        return 3*v**2 - 4*v + 1

    def approx_gradient(v, h=1e-5):
        # Central difference, perturbing one coordinate at a time.
        result = np.zeros_like(v)
        for idx in range(len(v)):
            bumped_up = v.copy()
            bumped_up[idx] += h
            bumped_down = v.copy()
            bumped_down[idx] -= h
            result[idx] = (objective(bumped_up) - objective(bumped_down)) / (2*h)
        return result

    point = np.array([1.0, 2.0, 3.0])
    exact = exact_gradient(point)
    approx = approx_gradient(point)

    print(f"\nAt x = {point}")
    print(f"f(x) = {objective(point)}")
    print(f"Analytical gradient: {exact}")
    print(f"Numerical gradient: {approx}")
    print(f"Max difference: {np.max(np.abs(exact - approx)):.10f}")

    if np.allclose(exact, approx, rtol=1e-4):
        print("✓ Gradient check PASSED")
    else:
        print("✗ Gradient check FAILED")
254
255
def mse_loss_gradients():
    """Derive MSE-loss gradients for linear regression by hand and
    confirm them against PyTorch autograd.

    Model: y_pred = Xw + b; loss L = (1/n) Σ (y_pred - y_true)^2.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("ML APPLICATION: MSE LOSS GRADIENTS")
    print(banner)

    print("\n--- Manual Computation ---")
    # Toy dataset: four samples, two features each.
    features = np.array([[1.0, 2.0],
                         [2.0, 3.0],
                         [3.0, 4.0],
                         [4.0, 5.0]])
    targets = np.array([3.0, 5.0, 7.0, 9.0])

    # Model parameters.
    weights = np.array([1.0, 0.5])
    bias = 0.5
    n_samples = len(targets)

    # Forward pass.
    predictions = features @ weights + bias
    mse = np.mean((predictions - targets)**2)

    print(f"X shape: {features.shape}")
    print(f"y_true: {targets}")
    print(f"W: {weights}, b: {bias}")
    print(f"y_pred: {predictions}")
    print(f"MSE Loss: {mse:.4f}")

    # Backward pass, with r = residuals:
    #   dL/dW = (2/n) X^T r,   dL/db = (2/n) Σ r
    residuals = predictions - targets
    grad_w = (2.0 / n_samples) * features.T @ residuals
    grad_b = (2.0 / n_samples) * np.sum(residuals)

    print(f"\nGradients (manual):")
    print(f"dL/dW = {grad_w}")
    print(f"dL/db = {grad_b:.4f}")

    # Same computation via autograd.
    print("\n--- PyTorch Autograd ---")
    X_t = torch.tensor(features, dtype=torch.float32)
    y_t = torch.tensor(targets, dtype=torch.float32)
    w_t = torch.tensor(weights, dtype=torch.float32, requires_grad=True)
    b_t = torch.tensor(bias, dtype=torch.float32, requires_grad=True)

    pred_t = X_t @ w_t + b_t
    loss_t = torch.mean((pred_t - y_t)**2)
    loss_t.backward()

    print(f"Loss: {loss_t.item():.4f}")
    print(f"dL/dW = {w_t.grad.numpy()}")
    print(f"dL/db = {b_t.grad.item():.4f}")

    print("\n--- Verification ---")
    print(f"W gradients match: {np.allclose(grad_w, w_t.grad.numpy())}")
    print(f"b gradients match: {np.isclose(grad_b, b_t.grad.item())}")
320
321
def visualize_gradient_descent(output_path='gradient_descent.png',
                               learning_rate=0.1, n_iterations=20):
    """Run gradient descent on f(x, y) = x^2 + 4y^2 and plot the trajectory.

    Args:
        output_path: File path for the saved figure. The previous version
            hard-coded an absolute path that only exists on one machine
            (and disagreed with the printed message); a relative default
            is portable and matches the message.
        learning_rate: Step size for each descent update.
        n_iterations: Number of descent steps to take.
    """
    print("\n" + "=" * 60)
    print("GRADIENT DESCENT VISUALIZATION")
    print("=" * 60)

    # Objective: elliptic paraboloid. The anisotropy (coefficients 1 vs 4)
    # makes the descent path bend toward the shallow axis.
    def f(x, y):
        return x**2 + 4*y**2

    def grad_f(x, y):
        return np.array([2*x, 8*y])

    # Starting point.
    x0 = np.array([3.0, 2.0])

    # Gradient descent: x <- x - lr * ∇f(x), recording every iterate.
    trajectory = [x0.copy()]
    x = x0.copy()
    for _ in range(n_iterations):
        x = x - learning_rate * grad_f(x[0], x[1])
        trajectory.append(x.copy())
    trajectory = np.array(trajectory)

    print(f"Starting point: {x0}")
    print(f"Final point: {trajectory[-1]}")
    print(f"Final loss: {f(trajectory[-1][0], trajectory[-1][1]):.6f}")

    # --- Contour plot with the recorded trajectory overlaid ---
    fig, ax = plt.subplots(figsize=(10, 8))

    x_range = np.linspace(-4, 4, 100)
    y_range = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x_range, y_range)
    Z = f(X, Y)

    contour = ax.contour(X, Y, Z, levels=20, cmap='viridis', alpha=0.6)
    ax.clabel(contour, inline=True, fontsize=8)

    # Descent path with distinct start/end markers.
    ax.plot(trajectory[:, 0], trajectory[:, 1], 'ro-', linewidth=2,
            markersize=6, label='Gradient Descent', alpha=0.7)
    ax.plot(trajectory[0, 0], trajectory[0, 1], 'go', markersize=12,
            label='Start', zorder=5)
    ax.plot(trajectory[-1, 0], trajectory[-1, 1], 'r*', markersize=15,
            label='End', zorder=5)

    # Arrows on every third step so the direction of travel stays legible.
    for i in range(0, len(trajectory) - 1, 3):
        dx = trajectory[i+1, 0] - trajectory[i, 0]
        dy = trajectory[i+1, 1] - trajectory[i, 1]
        ax.arrow(trajectory[i, 0], trajectory[i, 1], dx, dy,
                 head_width=0.15, head_length=0.1, fc='red', ec='red', alpha=0.5)

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Gradient Descent on f(x,y) = x² + 4y²')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal')

    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    print(f"\nVisualization saved to {output_path}")
    plt.close()
394
395
if __name__ == "__main__":
    # Execute every demonstration section, in order.
    demos = (
        manual_gradients,
        compute_jacobian,
        compute_hessian,
        pytorch_autograd,
        numerical_gradient_checking,
        mse_loss_gradients,
        visualize_gradient_descent,
    )
    for demo in demos:
        demo()

    closing = "=" * 60
    print("\n" + closing)
    print("All demonstrations completed!")
    print(closing)