# 03_backprop_scratch.py
"""
03. Backpropagation - NumPy version

Implements backpropagation directly with NumPy to understand how it works.
This file is the key to understanding deep learning!

In PyTorch this is a single loss.backward() call, but
here we apply the chain rule by hand.
"""
 10
 11import numpy as np
 12import matplotlib.pyplot as plt
 13
print("=" * 60)
print("NumPy ์—ญ์ „ํŒŒ (Backpropagation) from scratch")
print("=" * 60)


# ============================================
# 1. Activation functions and their derivatives
# ============================================
print("\n[1] ํ™œ์„ฑํ™” ํ•จ์ˆ˜์™€ ๋ฏธ๋ถ„")
print("-" * 40)
 24
 25def sigmoid(x):
 26    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
 27
 28def sigmoid_derivative(x):
 29    """sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))"""
 30    s = sigmoid(x)
 31    return s * (1 - s)
 32
 33def relu(x):
 34    return np.maximum(0, x)
 35
 36def relu_derivative(x):
 37    """relu'(x) = 1 if x > 0 else 0"""
 38    return (x > 0).astype(float)
 39
 40# ํ…Œ์ŠคํŠธ
 41x = np.array([-2, -1, 0, 1, 2])
 42print(f"x: {x}")
 43print(f"sigmoid(x): {sigmoid(x).round(4)}")
 44print(f"sigmoid'(x): {sigmoid_derivative(x).round(4)}")
 45print(f"relu(x): {relu(x)}")
 46print(f"relu'(x): {relu_derivative(x)}")
 47
 48
# ============================================
# 2. Single-neuron backprop (for intuition)
# ============================================
print("\n[2] ๋‹จ์ผ ๋‰ด๋Ÿฐ ์—ญ์ „ํŒŒ")
print("-" * 40)
 54
 55class SingleNeuron:
 56    """
 57    ๋‹จ์ผ ๋‰ด๋Ÿฐ: y = sigmoid(w*x + b)
 58    ์†์‹ค: L = (y - target)^2
 59    """
 60    def __init__(self):
 61        self.w = np.random.randn()
 62        self.b = np.random.randn()
 63
 64    def forward(self, x, target):
 65        """์ˆœ์ „ํŒŒ"""
 66        self.x = x
 67        self.target = target
 68
 69        # ๋‹จ๊ณ„๋ณ„ ๊ณ„์‚ฐ (์บ์‹œ์— ์ €์žฅ)
 70        self.z = self.w * x + self.b      # ์„ ํ˜• ๋ณ€ํ™˜
 71        self.a = sigmoid(self.z)           # ํ™œ์„ฑํ™”
 72        self.loss = (self.a - target) ** 2 # MSE
 73
 74        return self.a, self.loss
 75
 76    def backward(self):
 77        """
 78        ์—ญ์ „ํŒŒ: ์ฒด์ธ ๋ฃฐ ์ ์šฉ
 79
 80        dL/dw = (dL/da) * (da/dz) * (dz/dw)
 81        dL/db = (dL/da) * (da/dz) * (dz/db)
 82        """
 83        # 1. ์†์‹ค โ†’ ํ™œ์„ฑํ™”
 84        dL_da = 2 * (self.a - self.target)
 85
 86        # 2. ํ™œ์„ฑํ™” โ†’ ์„ ํ˜• (์‹œ๊ทธ๋ชจ์ด๋“œ ๋ฏธ๋ถ„)
 87        da_dz = sigmoid_derivative(self.z)
 88
 89        # 3. ์„ ํ˜• โ†’ ๊ฐ€์ค‘์น˜/ํŽธํ–ฅ
 90        dz_dw = self.x
 91        dz_db = 1
 92
 93        # ์ฒด์ธ ๋ฃฐ ์ ์šฉ
 94        dL_dw = dL_da * da_dz * dz_dw
 95        dL_db = dL_da * da_dz * dz_db
 96
 97        return dL_dw, dL_db
 98
# Demo: one forward/backward pass through a single neuron.
neuron = SingleNeuron()
x, target = 2.0, 1.0

print(f"์ž…๋ ฅ: x={x}, target={target}")
print(f"์ดˆ๊ธฐ ๊ฐ€์ค‘์น˜: w={neuron.w:.4f}, b={neuron.b:.4f}")

pred, loss = neuron.forward(x, target)
print(f"์˜ˆ์ธก: {pred:.4f}, ์†์‹ค: {loss:.4f}")

dw, db = neuron.backward()
print(f"๊ธฐ์šธ๊ธฐ: dL/dw={dw:.4f}, dL/db={db:.4f}")


# ============================================
# 3. Two-layer MLP backprop (the core!)
# ============================================
print("\n[3] 2์ธต MLP ์—ญ์ „ํŒŒ")
print("-" * 40)
118
class MLPFromScratch:
    """
    Two-layer MLP trained with hand-written backpropagation.

    Architecture: input -> [W1, b1] -> ReLU -> [W2, b2] -> Sigmoid -> output

    forward() caches every intermediate tensor (z1, a1, z2, a2) so that
    backward() can apply the chain rule without recomputation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # He (Kaiming) initialization: weights scaled by sqrt(2 / fan_in),
        # the appropriate scheme for ReLU layers. (Note: the original
        # comment called this "Xavier", but sqrt(2/n) is He scaling.)
        self.W1 = np.sqrt(2.0 / input_dim) * np.random.randn(input_dim, hidden_dim)
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.sqrt(2.0 / hidden_dim) * np.random.randn(hidden_dim, output_dim)
        self.b2 = np.zeros(output_dim)

        print(f"MLP ์ƒ์„ฑ: {input_dim} โ†’ {hidden_dim} โ†’ {output_dim}")

    def forward(self, X):
        """Forward pass; caches intermediates needed by backward()."""
        # Hidden layer: affine transform followed by ReLU.
        self.z1 = X @ self.W1 + self.b1
        self.a1 = relu(self.z1)

        # Output layer: affine transform followed by sigmoid.
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = sigmoid(self.z2)

        return self.a2

    def backward(self, X, y_true):
        """
        Backward pass: apply the chain rule layer by layer.

        Key identities:
        dL/dW2 = a1.T @ (dL/dz2)
        dL/dW1 = X.T @ (dL/dz1)

        Returns a dict of gradients keyed by parameter name.
        """
        batch_size = X.shape[0]

        # --- Output layer ---
        # MSE loss: dL/da2 = 2(a2 - y) / m, then fold through sigmoid'(z2).
        delta_out = 2 * (self.a2 - y_true) / batch_size * sigmoid_derivative(self.z2)

        grad_W2 = self.a1.T @ delta_out
        grad_b2 = delta_out.sum(axis=0)

        # --- Hidden layer ---
        # Propagate the error back through W2, then gate by relu'(z1).
        delta_hidden = (delta_out @ self.W2.T) * relu_derivative(self.z1)

        grad_W1 = X.T @ delta_hidden
        grad_b1 = delta_hidden.sum(axis=0)

        return {'W1': grad_W1, 'b1': grad_b1, 'W2': grad_W2, 'b2': grad_b2}

    def update(self, grads, lr):
        """One vanilla gradient-descent step (in place on each parameter)."""
        self.W1 -= lr * grads['W1']
        self.b1 -= lr * grads['b1']
        self.W2 -= lr * grads['W2']
        self.b2 -= lr * grads['b2']

    def loss(self, y_pred, y_true):
        """Mean squared error between predictions and targets."""
        residual = y_pred - y_true
        return np.mean(residual ** 2)
190
191
# ============================================
# 4. Testing on the XOR problem
# ============================================
print("\n[4] XOR ๋ฌธ์ œ ํ•™์Šต")
print("-" * 40)

# XOR dataset: 4 samples, 2 features; not linearly separable,
# so a hidden layer is required.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float64)
y = np.array([[0], [1], [1], [0]], dtype=np.float64)

# Build the model (seeded for reproducible weights).
np.random.seed(42)
mlp = MLPFromScratch(input_dim=2, hidden_dim=8, output_dim=1)

# Training hyperparameters.
learning_rate = 1.0
epochs = 2000
losses = []

for epoch in range(epochs):
    # Forward pass
    y_pred = mlp.forward(X)
    loss = mlp.loss(y_pred, y)
    losses.append(loss)

    # Backward pass
    grads = mlp.backward(X, y)

    # Gradient-descent weight update
    mlp.update(grads, learning_rate)

    if (epoch + 1) % 400 == 0:
        print(f"Epoch {epoch+1}: Loss = {loss:.6f}")

# Inspect the learned mapping on all four XOR inputs.
print("\nํ•™์Šต ๊ฒฐ๊ณผ:")
y_final = mlp.forward(X)
for i in range(4):
    print(f"  {X[i]} โ†’ {y_final[i, 0]:.4f} (์ •๋‹ต: {y[i, 0]})")

# Plot the loss curve on a log scale and save it to disk.
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('XOR Training Loss (NumPy Backprop)')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.savefig('numpy_xor_loss.png', dpi=100)
plt.close()
print("\n์†์‹ค ๊ทธ๋ž˜ํ”„ ์ €์žฅ: numpy_xor_loss.png")
243
244
# ============================================
# 5. Gradient checking
# ============================================
print("\n[5] ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ")
print("-" * 40)
250
def numerical_gradient(model, X, y, param_name, h=1e-5):
    """Estimate dL/dparam by central differences, one entry at a time.

    Temporarily perturbs each element of the named parameter by +/- h,
    re-runs the forward pass and loss, and restores the original value
    before moving on. The model only needs forward(X) and loss(pred, y).
    """
    param = getattr(model, param_name)
    grad = np.zeros_like(param)

    # np.ndindex walks the array in C order, same as nditer's multi_index.
    for idx in np.ndindex(param.shape):
        saved = param[idx]

        # f(theta + h)
        param[idx] = saved + h
        loss_hi = model.loss(model.forward(X), y)

        # f(theta - h)
        param[idx] = saved - h
        loss_lo = model.loss(model.forward(X), y)

        # Central-difference quotient.
        grad[idx] = (loss_hi - loss_lo) / (2 * h)

        param[idx] = saved  # restore before the next element

    return grad
276
277# ์ž‘์€ ๋ชจ๋ธ๋กœ ํ…Œ์ŠคํŠธ
278np.random.seed(0)
279small_mlp = MLPFromScratch(2, 4, 1)
280
281# ์ˆœ์ „ํŒŒ
282y_pred = small_mlp.forward(X)
283
284# ํ•ด์„์  ๊ธฐ์šธ๊ธฐ (์—ญ์ „ํŒŒ)
285analytical_grads = small_mlp.backward(X, y)
286
287# ์ˆ˜์น˜์  ๊ธฐ์šธ๊ธฐ
288numerical_W1 = numerical_gradient(small_mlp, X, y, 'W1')
289numerical_W2 = numerical_gradient(small_mlp, X, y, 'W2')
290
291# ๋น„๊ต
292diff_W1 = np.linalg.norm(analytical_grads['W1'] - numerical_W1)
293diff_W2 = np.linalg.norm(analytical_grads['W2'] - numerical_W2)
294
295print(f"W1 ๊ธฐ์šธ๊ธฐ ์ฐจ์ด: {diff_W1:.2e}")
296print(f"W2 ๊ธฐ์šธ๊ธฐ ์ฐจ์ด: {diff_W2:.2e}")
297
298if diff_W1 < 1e-5 and diff_W2 < 1e-5:
299    print("โœ“ ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ ํ†ต๊ณผ!")
300else:
301    print("โœ— ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ ์‹คํŒจ")
302
303
# ============================================
# 6. Chain-rule visualization
# ============================================
print("\n[6] ์ฒด์ธ ๋ฃฐ ํ๋ฆ„")
print("-" * 40)

# ASCII diagram of the forward/backward data flow, printed verbatim.
chain_rule_diagram = """
์ˆœ์ „ํŒŒ (Forward):
    x โ”€โ”€โ–ถ z1=xW1+b1 โ”€โ”€โ–ถ a1=relu(z1) โ”€โ”€โ–ถ z2=a1W2+b2 โ”€โ”€โ–ถ a2=ฯƒ(z2) โ”€โ”€โ–ถ L=MSE

์—ญ์ „ํŒŒ (Backward):
    dL/dW1 โ—€โ”€โ”€ dL/dz1 โ—€โ”€โ”€ dL/da1 โ—€โ”€โ”€ dL/dz2 โ—€โ”€โ”€ dL/da2 โ—€โ”€โ”€ dL/dL=1

์ฒด์ธ ๋ฃฐ ์ ์šฉ:
    dL/dW2 = (dL/da2) ร— (da2/dz2) ร— (dz2/dW2)
           = 2(a2-y) ร— ฯƒ'(z2) ร— a1.T

    dL/dW1 = (dL/da2) ร— (da2/dz2) ร— (dz2/da1) ร— (da1/dz1) ร— (dz1/dW1)
           = 2(a2-y) ร— ฯƒ'(z2) ร— W2.T ร— relu'(z1) ร— x.T
"""
print(chain_rule_diagram)
325
326
# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("์—ญ์ „ํŒŒ ํ•ต์‹ฌ ์ •๋ฆฌ")
print("=" * 60)

# Recap of the whole pipeline, printed verbatim.
summary = """
1. ์ˆœ์ „ํŒŒ: ์ž…๋ ฅ โ†’ ์ถœ๋ ฅ ๋ฐฉํ–ฅ์œผ๋กœ ๊ฐ’ ๊ณ„์‚ฐ
2. ์†์‹ค ๊ณ„์‚ฐ: ์˜ˆ์ธก๊ณผ ์ •๋‹ต์˜ ์ฐจ์ด
3. ์—ญ์ „ํŒŒ: ์ถœ๋ ฅ โ†’ ์ž…๋ ฅ ๋ฐฉํ–ฅ์œผ๋กœ ๊ธฐ์šธ๊ธฐ ๊ณ„์‚ฐ (์ฒด์ธ ๋ฃฐ)
4. ์—…๋ฐ์ดํŠธ: W = W - lr ร— (dL/dW)

ํ•ต์‹ฌ ๊ณต์‹:
- ์ถœ๋ ฅ์ธต: dL/dz2 = dL/da2 ร— ฯƒ'(z2)
- ์€๋‹‰์ธต: dL/dz1 = (dL/dz2 @ W2.T) ร— relu'(z1)
- ๊ฐ€์ค‘์น˜: dL/dW = ์ด์ „์ธต์ถœ๋ ฅ.T @ ํ˜„์žฌ์ธต๊ธฐ์šธ๊ธฐ

PyTorch์—์„œ๋Š”:
    loss.backward()  # ์ด ํ•œ ์ค„์ด ์œ„์˜ ๋ชจ๋“  ๊ณผ์ •์„ ์ž๋™ ์ˆ˜ํ–‰!

NumPy ๊ตฌํ˜„์˜ ๊ฐ€์น˜:
1. ํ–‰๋ ฌ ๊ณฑ์…ˆ์˜ ์ „์น˜ ๋ฐฉํ–ฅ ์ดํ•ด
2. ํ™œ์„ฑํ™” ํ•จ์ˆ˜ ๋ฏธ๋ถ„์˜ ์—ญํ•  ์ดํ•ด
3. ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ์—์„œ ํ•ฉ์‚ฐ์ด ํ•„์š”ํ•œ ์ด์œ  ์ดํ•ด
"""
print(summary)
print("=" * 60)