1"""
2Gradient Descent Optimization Algorithms
3
4This script implements various gradient descent optimizers from scratch:
5- Gradient Descent (GD)
6- Stochastic Gradient Descent (SGD)
7- Momentum
8- Adam
9
10We optimize the Rosenbrock function: f(x,y) = (a-x)^2 + b(y-x^2)^2
11This is a classic non-convex test function with a narrow valley.
12
13Author: Math for AI Examples
14"""
15
16import numpy as np
17import matplotlib.pyplot as plt
18from typing import Callable, Tuple, List
19
20
class GradientDescent:
    """Vanilla gradient descent: theta <- theta - lr * grad."""

    def __init__(self, learning_rate: float = 0.001):
        self.lr = learning_rate

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Take one descent step and return the updated parameters."""
        updated = params - self.lr * grads
        return updated
30
31
class Momentum:
    """Gradient descent with classical (heavy-ball) momentum.

    Maintains a running velocity v_t = beta * v_{t-1} + g_t and moves the
    parameters against it: theta <- theta - lr * v_t.
    """

    def __init__(self, learning_rate: float = 0.001, beta: float = 0.9):
        self.lr = learning_rate
        self.beta = beta
        self.velocity = None  # lazily created on the first step

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Fold the gradient into the velocity and take one step."""
        if self.velocity is None:
            self.velocity = np.zeros_like(params)

        new_velocity = self.beta * self.velocity + grads
        self.velocity = new_velocity
        return params - self.lr * new_velocity
49
50
class Adam:
    """Adam optimizer (Adaptive Moment Estimation).

    Tracks exponential moving averages of the gradient (first moment) and
    of its elementwise square (second moment), corrects their
    zero-initialization bias, and scales each step by the inverse root of
    the second moment.
    """

    def __init__(self, learning_rate: float = 0.001, beta1: float = 0.9,
                 beta2: float = 0.999, epsilon: float = 1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon  # numerical-safety term in the denominator
        self.m = None  # EMA of gradients (first moment)
        self.v = None  # EMA of squared gradients (second moment)
        self.t = 0     # steps taken so far, used for bias correction

    def step(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Apply one Adam update and return the new parameters."""
        if self.m is None:
            self.m, self.v = np.zeros_like(params), np.zeros_like(params)

        self.t += 1
        b1, b2 = self.beta1, self.beta2

        self.m = b1 * self.m + (1.0 - b1) * grads
        self.v = b2 * self.v + (1.0 - b2) * np.square(grads)

        # Undo the bias toward zero that the moving averages start with.
        m_hat = self.m / (1.0 - b1 ** self.t)
        v_hat = self.v / (1.0 - b2 ** self.t)

        return params - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
82
83
def rosenbrock(x: np.ndarray, a: float = 1.0, b: float = 100.0) -> float:
    """
    Evaluate the Rosenbrock function f(x, y) = (a - x)^2 + b*(y - x^2)^2.

    The global minimum sits at (a, a^2), where the value is exactly 0.
    """
    u, w = x[0], x[1]
    return (a - u) ** 2 + b * (w - u ** 2) ** 2
90
91
def rosenbrock_gradient(x: np.ndarray, a: float = 1.0, b: float = 100.0) -> np.ndarray:
    """
    Analytic gradient of the Rosenbrock function.

    df/dx = -2(a-x) - 4bx(y-x^2)
    df/dy = 2b(y-x^2)

    Args:
        x: Point (x, y); any array-like of length 2.
        a: Rosenbrock parameter (minimum lies at (a, a^2)).
        b: Valley-steepness parameter.

    Returns:
        The gradient as a float ndarray with the same shape as ``x``.
    """
    # Force a float working copy: the previous np.zeros_like(x) inherited an
    # integer dtype from integer inputs and silently truncated fractional
    # gradient components on assignment.
    x = np.asarray(x, dtype=float)
    grad = np.empty_like(x)
    grad[0] = -2 * (a - x[0]) - 4 * b * x[0] * (x[1] - x[0]**2)
    grad[1] = 2 * b * (x[1] - x[0]**2)
    return grad
102
103
def optimize(
    optimizer,
    initial_point: np.ndarray,
    n_iterations: int = 1000,
    a: float = 1.0,
    b: float = 100.0
) -> Tuple[List[np.ndarray], List[float]]:
    """
    Minimize the Rosenbrock function with the given optimizer, recording
    the full path taken.

    Args:
        optimizer: Object exposing ``step(params, grads) -> params``.
        initial_point: Starting (x, y) position; left unmodified.
        n_iterations: Number of optimizer steps to run.
        a: Rosenbrock parameter.
        b: Rosenbrock parameter.

    Returns:
        trajectory: Parameter values, start included (n_iterations + 1 entries).
        losses: Matching loss values at each point.
    """
    point = initial_point.copy()
    trajectory = [point.copy()]
    losses = [rosenbrock(point, a, b)]

    for _ in range(n_iterations):
        point = optimizer.step(point, rosenbrock_gradient(point, a, b))
        trajectory.append(point.copy())
        losses.append(rosenbrock(point, a, b))

    return trajectory, losses
130
131
def learning_rate_schedule_demo():
    """Plot common learning-rate schedules and save the figure to disk.

    Schedules shown (all starting from ``initial_lr``):
      - Step decay: halve the rate every 100 iterations.
      - Exponential decay: lr_0 * exp(-k * t).
      - Cosine annealing: smooth decay from lr_max down to lr_min over T steps.
      - Linear decay: straight line from lr_0 to 0 at t = T.

    Writes 'learning_rate_schedules.png' to the current directory.
    """
    print("\n" + "="*60)
    print("Learning Rate Schedule Demonstration")
    print("="*60)

    n_iterations = 1000
    initial_lr = 0.1

    # Named defs instead of lambda assignments (PEP 8 E731).
    def step_decay(t):
        # lr = lr_0 * decay_rate^(t // drop_every)
        return initial_lr * (0.5 ** (t // 100))

    def exp_decay(t):
        # lr = lr_0 * exp(-decay_rate * t)
        return initial_lr * np.exp(-0.005 * t)

    def cosine_decay(t):
        # lr = lr_min + 0.5*(lr_max - lr_min)*(1 + cos(t*pi/T))
        return 0.001 + 0.5 * (initial_lr - 0.001) * \
            (1 + np.cos(np.pi * t / n_iterations))

    def linear_decay(t):
        # lr = lr_0 * (1 - t/T)
        return initial_lr * (1 - t / n_iterations)

    schedules = [
        ('Step Decay', step_decay),
        ('Exponential Decay', exp_decay),
        ('Cosine Annealing', cosine_decay),
        ('Linear Decay', linear_decay),
    ]

    steps = np.arange(n_iterations)

    plt.figure(figsize=(10, 6))
    for label, schedule in schedules:
        plt.plot(steps, [schedule(t) for t in steps], label=label, linewidth=2)

    plt.xlabel('Iteration', fontsize=12)
    plt.ylabel('Learning Rate', fontsize=12)
    plt.title('Learning Rate Schedules', fontsize=14, fontweight='bold')
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('learning_rate_schedules.png', dpi=150, bbox_inches='tight')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    print("Saved learning rate schedules plot to 'learning_rate_schedules.png'")
170
171
def compare_optimizers():
    """Race GD, Momentum, and Adam on the Rosenbrock function.

    Runs each optimizer from the same starting point, prints each final
    iterate and loss, and saves a two-panel figure (trajectories over the
    loss contours; loss-vs-iteration curves) to 'optimizer_comparison.png'.
    """
    print("\n" + "="*60)
    print("Comparing Optimization Algorithms on Rosenbrock Function")
    print("="*60)

    # Common starting point inside the curved valley.
    x0 = np.array([-1.0, 1.0])
    n_iterations = 500

    # Learning rates chosen per optimizer; Adam tolerates a larger one.
    optimizers = {
        'GD (lr=0.001)': GradientDescent(learning_rate=0.001),
        'Momentum (lr=0.001)': Momentum(learning_rate=0.001, beta=0.9),
        'Adam (lr=0.01)': Adam(learning_rate=0.01)
    }

    results = {}

    # Run optimization for each optimizer
    for name, optimizer in optimizers.items():
        trajectory, losses = optimize(optimizer, x0, n_iterations)
        results[name] = {
            'trajectory': trajectory,
            'losses': losses,
            'final_point': trajectory[-1],
            'final_loss': losses[-1]
        }

        print(f"\n{name}:")
        print(f" Final point: ({trajectory[-1][0]:.6f}, {trajectory[-1][1]:.6f})")
        print(f" Final loss: {losses[-1]:.6e}")
        print(f" Target: (1.0, 1.0), Loss: 0.0")

    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Contour plot with trajectories
    x = np.linspace(-2, 2, 400)
    y = np.linspace(-1, 3, 400)
    X, Y = np.meshgrid(x, y)

    # Vectorized grid evaluation: rosenbrock() only indexes x[0]/x[1], so a
    # stacked (2, H, W) array evaluates every grid point in a single call
    # instead of the previous 160,000 Python-level calls.
    Z = rosenbrock(np.stack([X, Y]))

    # Use log scale for better visualization
    levels = np.logspace(-1, 3.5, 20)
    contour = ax1.contour(X, Y, Z, levels=levels, cmap='viridis', alpha=0.6)
    ax1.clabel(contour, inline=True, fontsize=8)

    colors = ['red', 'blue', 'green']
    for (name, result), color in zip(results.items(), colors):
        traj = np.array(result['trajectory'])
        ax1.plot(traj[:, 0], traj[:, 1], 'o-', color=color,
                 label=name, markersize=2, linewidth=1.5, alpha=0.7)
        # Label the shared start point only once (first optimizer/color).
        ax1.plot(traj[0, 0], traj[0, 1], 'ko', markersize=8, label='Start' if color == 'red' else '')

    ax1.plot(1.0, 1.0, 'r*', markersize=15, label='Global Minimum')
    ax1.set_xlabel('x', fontsize=12)
    ax1.set_ylabel('y', fontsize=12)
    ax1.set_title('Optimization Trajectories', fontsize=14, fontweight='bold')
    ax1.legend(fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Loss curves
    for name, result in results.items():
        ax2.semilogy(result['losses'], label=name, linewidth=2)

    ax2.set_xlabel('Iteration', fontsize=12)
    ax2.set_ylabel('Loss (log scale)', fontsize=12)
    ax2.set_title('Convergence Comparison', fontsize=14, fontweight='bold')
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('optimizer_comparison.png', dpi=150, bbox_inches='tight')
    # Release the figure to avoid open-figure accumulation across calls.
    plt.close(fig)
    print("\nSaved comparison plot to 'optimizer_comparison.png'")
251
252
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("Gradient Descent Optimization Examples")
    print(banner)

    # Run both demos: optimizer race first, then the schedule plots.
    compare_optimizers()
    learning_rate_schedule_demo()

    print("\n" + banner)
    print("Key Takeaways:")
    print(banner)
    for takeaway in (
        "1. Basic GD can be slow on narrow valleys (Rosenbrock)",
        "2. Momentum helps accelerate convergence in relevant directions",
        "3. Adam adapts learning rates per parameter, often converges fastest",
        "4. Learning rate schedules can improve final convergence",
        "5. Choice of optimizer depends on problem structure and constraints",
    ):
        print(takeaway)