1"""
2Gradient Descent Optimization Algorithms
3
4This script implements various gradient descent optimizers from scratch:
5- Gradient Descent (GD)
6- Stochastic Gradient Descent (SGD)
7- Momentum
8- Adam
9
10We optimize the Rosenbrock function: f(x,y) = (a-x)^2 + b(y-x^2)^2
11This is a classic non-convex test function with a narrow valley.
12
13Author: Math for AI Examples
14"""
15
16import numpy as np
17import matplotlib.pyplot as plt
18from typing import Callable, Tuple, List
19
20
class GradientDescent:
    """Vanilla gradient descent: theta <- theta - lr * grad."""

    def __init__(self, learning_rate: float = 0.001):
        self.lr = learning_rate

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Take one descent step and return the updated parameters."""
        updated = params - self.lr * grads
        return updated
30
31
class Momentum:
    """Gradient descent with classical (heavy-ball) momentum.

    Maintains a running velocity v_t = beta * v_{t-1} + g_t and moves the
    parameters against it: theta <- theta - lr * v_t.
    """

    def __init__(self, learning_rate: float = 0.001, beta: float = 0.9):
        self.lr = learning_rate
        self.beta = beta
        self.velocity = None  # lazily created on the first step

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Fold the gradient into the velocity and take one step."""
        if self.velocity is None:
            self.velocity = np.zeros_like(params)

        new_velocity = self.beta * self.velocity + grads
        self.velocity = new_velocity
        return params - self.lr * new_velocity
49
50
class Adam:
    """Adam optimizer (Adaptive Moment Estimation).

    Tracks exponential moving averages of the gradient (first moment) and
    of its elementwise square (second moment), corrects their
    zero-initialization bias, and scales each step by the inverse root of
    the second moment.
    """

    def __init__(self, learning_rate: float = 0.001, beta1: float = 0.9,
                 beta2: float = 0.999, epsilon: float = 1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon  # numerical-safety term in the denominator
        self.m = None  # EMA of gradients (first moment)
        self.v = None  # EMA of squared gradients (second moment)
        self.t = 0     # steps taken so far, used for bias correction

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Apply one Adam update and return the new parameters."""
        if self.m is None:
            self.m, self.v = np.zeros_like(params), np.zeros_like(params)

        self.t += 1
        b1, b2 = self.beta1, self.beta2

        self.m = b1 * self.m + (1.0 - b1) * grads
        self.v = b2 * self.v + (1.0 - b2) * np.square(grads)

        # Undo the bias toward zero that the moving averages start with.
        m_hat = self.m / (1.0 - b1 ** self.t)
        v_hat = self.v / (1.0 - b2 ** self.t)

        return params - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
82
83
def rosenbrock(x: np.ndarray, a: float = 1.0, b: float = 100.0) -> float:
    """
    Evaluate the Rosenbrock function f(x, y) = (a - x)^2 + b*(y - x^2)^2.

    The global minimum sits at (a, a^2), where the value is exactly 0.
    """
    u, w = x[0], x[1]
    return (a - u) ** 2 + b * (w - u ** 2) ** 2
90
91
def rosenbrock_gradient(x: np.ndarray, a: float = 1.0, b: float = 100.0) -> np.ndarray:
    """
    Analytic gradient of the Rosenbrock function.

    df/dx = -2(a-x) - 4bx(y-x^2)
    df/dy = 2b(y-x^2)

    Args:
        x: Point (x, y); any array-like of length 2.
        a: Rosenbrock parameter (minimum lies at (a, a^2)).
        b: Valley-steepness parameter.

    Returns:
        The gradient as a float ndarray with the same shape as ``x``.
    """
    # Force a float working copy: the previous np.zeros_like(x) inherited an
    # integer dtype from integer inputs and silently truncated fractional
    # gradient components on assignment.
    x = np.asarray(x, dtype=float)
    grad = np.empty_like(x)
    grad[0] = -2 * (a - x[0]) - 4 * b * x[0] * (x[1] - x[0]**2)
    grad[1] = 2 * b * (x[1] - x[0]**2)
    return grad
102
103
def optimize(
    optimizer,
    initial_point: np.ndarray,
    n_iterations: int = 1000,
    a: float = 1.0,
    b: float = 100.0
) -> Tuple[List[np.ndarray], List[float]]:
    """
    Minimize the Rosenbrock function with the given optimizer, recording
    the full path taken.

    Args:
        optimizer: Object exposing ``step(params, grads) -> params``.
        initial_point: Starting (x, y) position; left unmodified.
        n_iterations: Number of optimizer steps to run.
        a: Rosenbrock parameter.
        b: Rosenbrock parameter.

    Returns:
        trajectory: Parameter values, start included (n_iterations + 1 entries).
        losses: Matching loss values at each point.
    """
    point = initial_point.copy()
    trajectory = [point.copy()]
    losses = [rosenbrock(point, a, b)]

    for _ in range(n_iterations):
        point = optimizer.step(point, rosenbrock_gradient(point, a, b))
        trajectory.append(point.copy())
        losses.append(rosenbrock(point, a, b))

    return trajectory, losses
130
131
def learning_rate_schedule_demo():
    """Plot common learning-rate schedules and save the figure to disk.

    Schedules shown (all starting from ``initial_lr``):
      - Step decay: halve the rate every 100 iterations.
      - Exponential decay: lr_0 * exp(-k * t).
      - Cosine annealing: smooth decay from lr_max down to lr_min over T steps.
      - Linear decay: straight line from lr_0 to 0 at t = T.

    Writes 'learning_rate_schedules.png' to the current directory.
    """
    print("\n" + "="*60)
    print("Learning Rate Schedule Demonstration")
    print("="*60)

    n_iterations = 1000
    initial_lr = 0.1

    # Named defs instead of lambda assignments (PEP 8 E731).
    def step_decay(t):
        # lr = lr_0 * decay_rate^(t // drop_every)
        return initial_lr * (0.5 ** (t // 100))

    def exp_decay(t):
        # lr = lr_0 * exp(-decay_rate * t)
        return initial_lr * np.exp(-0.005 * t)

    def cosine_decay(t):
        # lr = lr_min + 0.5*(lr_max - lr_min)*(1 + cos(t*pi/T))
        return 0.001 + 0.5 * (initial_lr - 0.001) * \
            (1 + np.cos(np.pi * t / n_iterations))

    def linear_decay(t):
        # lr = lr_0 * (1 - t/T)
        return initial_lr * (1 - t / n_iterations)

    schedules = [
        ('Step Decay', step_decay),
        ('Exponential Decay', exp_decay),
        ('Cosine Annealing', cosine_decay),
        ('Linear Decay', linear_decay),
    ]

    steps = np.arange(n_iterations)

    plt.figure(figsize=(10, 6))
    for label, schedule in schedules:
        plt.plot(steps, [schedule(t) for t in steps], label=label, linewidth=2)

    plt.xlabel('Iteration', fontsize=12)
    plt.ylabel('Learning Rate', fontsize=12)
    plt.title('Learning Rate Schedules', fontsize=14, fontweight='bold')
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('learning_rate_schedules.png', dpi=150, bbox_inches='tight')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    print("Saved learning rate schedules plot to 'learning_rate_schedules.png'")
170
171
def compare_optimizers():
    """Race GD, Momentum, and Adam on the Rosenbrock function.

    Runs each optimizer from the same starting point, prints each final
    iterate and loss, and saves a two-panel figure (trajectories over the
    loss contours; loss-vs-iteration curves) to 'optimizer_comparison.png'.
    """
    print("\n" + "="*60)
    print("Comparing Optimization Algorithms on Rosenbrock Function")
    print("="*60)

    # Common starting point inside the curved valley.
    x0 = np.array([-1.0, 1.0])
    n_iterations = 500

    # Learning rates chosen per optimizer; Adam tolerates a larger one.
    optimizers = {
        'GD (lr=0.001)': GradientDescent(learning_rate=0.001),
        'Momentum (lr=0.001)': Momentum(learning_rate=0.001, beta=0.9),
        'Adam (lr=0.01)': Adam(learning_rate=0.01)
    }

    results = {}

    # Run optimization for each optimizer
    for name, optimizer in optimizers.items():
        trajectory, losses = optimize(optimizer, x0, n_iterations)
        results[name] = {
            'trajectory': trajectory,
            'losses': losses,
            'final_point': trajectory[-1],
            'final_loss': losses[-1]
        }

        print(f"\n{name}:")
        print(f" Final point: ({trajectory[-1][0]:.6f}, {trajectory[-1][1]:.6f})")
        print(f" Final loss: {losses[-1]:.6e}")
        print(f" Target: (1.0, 1.0), Loss: 0.0")

    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Contour plot with trajectories
    x = np.linspace(-2, 2, 400)
    y = np.linspace(-1, 3, 400)
    X, Y = np.meshgrid(x, y)

    # Vectorized grid evaluation: rosenbrock() only indexes x[0]/x[1], so a
    # stacked (2, H, W) array evaluates every grid point in a single call
    # instead of the previous 160,000 Python-level calls.
    Z = rosenbrock(np.stack([X, Y]))

    # Use log scale for better visualization
    levels = np.logspace(-1, 3.5, 20)
    contour = ax1.contour(X, Y, Z, levels=levels, cmap='viridis', alpha=0.6)
    ax1.clabel(contour, inline=True, fontsize=8)

    colors = ['red', 'blue', 'green']
    for (name, result), color in zip(results.items(), colors):
        traj = np.array(result['trajectory'])
        ax1.plot(traj[:, 0], traj[:, 1], 'o-', color=color,
                 label=name, markersize=2, linewidth=1.5, alpha=0.7)
        # Label the shared start point only once (first optimizer/color).
        ax1.plot(traj[0, 0], traj[0, 1], 'ko', markersize=8, label='Start' if color == 'red' else '')

    ax1.plot(1.0, 1.0, 'r*', markersize=15, label='Global Minimum')
    ax1.set_xlabel('x', fontsize=12)
    ax1.set_ylabel('y', fontsize=12)
    ax1.set_title('Optimization Trajectories', fontsize=14, fontweight='bold')
    ax1.legend(fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Loss curves
    for name, result in results.items():
        ax2.semilogy(result['losses'], label=name, linewidth=2)

    ax2.set_xlabel('Iteration', fontsize=12)
    ax2.set_ylabel('Loss (log scale)', fontsize=12)
    ax2.set_title('Convergence Comparison', fontsize=14, fontweight='bold')
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('optimizer_comparison.png', dpi=150, bbox_inches='tight')
    # Release the figure to avoid open-figure accumulation across calls.
    plt.close(fig)
    print("\nSaved comparison plot to 'optimizer_comparison.png'")
251
252
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Gradient Descent Optimization Examples")
    print(banner)

    # Run both demos: optimizer race first, then the schedule plots.
    compare_optimizers()
    learning_rate_schedule_demo()

    print("\n" + banner)
    print("Key Takeaways:")
    print(banner)
    for takeaway in (
        "1. Basic GD can be slow on narrow valleys (Rosenbrock)",
        "2. Momentum helps accelerate convergence in relevant directions",
        "3. Adam adapts learning rates per parameter, often converges fastest",
        "4. Learning rate schedules can improve final convergence",
        "5. Choice of optimizer depends on problem structure and constraints",
    ):
        print(takeaway)