# 03_backprop_scratch.py
"""
03. Backpropagation - NumPy version

Implements backpropagation directly with NumPy to understand how it works.
This file is the key to understanding deep learning!

In PyTorch this is a single loss.backward() call, but
here we apply the chain rule by hand.
"""
 10
 11import numpy as np
 12import matplotlib.pyplot as plt
 13
print("=" * 60)
print("NumPy ์—ญ์ „ํŒŒ (Backpropagation) from scratch")
print("=" * 60)


# ============================================
# 1. Activation functions and their derivatives
# ============================================
print("\n[1] ํ™œ์„ฑํ™” ํ•จ์ˆ˜์™€ ๋ฏธ๋ถ„")
print("-" * 40)
 24
 25def sigmoid(x):
 26    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
 27
 28def sigmoid_derivative(x):
 29    """sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))"""
 30    s = sigmoid(x)
 31    return s * (1 - s)
 32
 33def relu(x):
 34    return np.maximum(0, x)
 35
 36def relu_derivative(x):
 37    """relu'(x) = 1 if x > 0 else 0"""
 38    return (x > 0).astype(float)
 39
 40# ํ…Œ์ŠคํŠธ
 41x = np.array([-2, -1, 0, 1, 2])
 42print(f"x: {x}")
 43print(f"sigmoid(x): {sigmoid(x).round(4)}")
 44print(f"sigmoid'(x): {sigmoid_derivative(x).round(4)}")
 45print(f"relu(x): {relu(x)}")
 46print(f"relu'(x): {relu_derivative(x)}")
 47
 48
# ============================================
# 2. Single-neuron backprop (for intuition)
# ============================================
print("\n[2] ๋‹จ์ผ ๋‰ด๋Ÿฐ ์—ญ์ „ํŒŒ")
print("-" * 40)
 54
 55class SingleNeuron:
 56    """
 57    ๋‹จ์ผ ๋‰ด๋Ÿฐ: y = sigmoid(w*x + b)
 58    ์†์‹ค: L = (y - target)^2
 59    """
 60    def __init__(self):
 61        self.w = np.random.randn()
 62        self.b = np.random.randn()
 63
 64    def forward(self, x, target):
 65        """์ˆœ์ „ํŒŒ"""
 66        self.x = x
 67        self.target = target
 68
 69        # ๋‹จ๊ณ„๋ณ„ ๊ณ„์‚ฐ (์บ์‹œ์— ์ €์žฅ)
 70        self.z = self.w * x + self.b      # ์„ ํ˜• ๋ณ€ํ™˜
 71        self.a = sigmoid(self.z)           # ํ™œ์„ฑํ™”
 72        self.loss = (self.a - target) ** 2 # MSE
 73
 74        return self.a, self.loss
 75
 76    def backward(self):
 77        """
 78        ์—ญ์ „ํŒŒ: ์ฒด์ธ ๋ฃฐ ์ ์šฉ
 79
 80        dL/dw = (dL/da) * (da/dz) * (dz/dw)
 81        dL/db = (dL/da) * (da/dz) * (dz/db)
 82        """
 83        # 1. ์†์‹ค โ†’ ํ™œ์„ฑํ™”
 84        dL_da = 2 * (self.a - self.target)
 85
 86        # 2. ํ™œ์„ฑํ™” โ†’ ์„ ํ˜• (์‹œ๊ทธ๋ชจ์ด๋“œ ๋ฏธ๋ถ„)
 87        da_dz = sigmoid_derivative(self.z)
 88
 89        # 3. ์„ ํ˜• โ†’ ๊ฐ€์ค‘์น˜/ํŽธํ–ฅ
 90        dz_dw = self.x
 91        dz_db = 1
 92
 93        # ์ฒด์ธ ๋ฃฐ ์ ์šฉ
 94        dL_dw = dL_da * da_dz * dz_dw
 95        dL_db = dL_da * da_dz * dz_db
 96
 97        return dL_dw, dL_db
 98
# Demo: one forward/backward pass through a single neuron.
neuron = SingleNeuron()
x, target = 2.0, 1.0

print(f"์ž…๋ ฅ: x={x}, target={target}")
print(f"์ดˆ๊ธฐ ๊ฐ€์ค‘์น˜: w={neuron.w:.4f}, b={neuron.b:.4f}")

pred, loss = neuron.forward(x, target)
print(f"์˜ˆ์ธก: {pred:.4f}, ์†์‹ค: {loss:.4f}")

dw, db = neuron.backward()
print(f"๊ธฐ์šธ๊ธฐ: dL/dw={dw:.4f}, dL/db={db:.4f}")


# ============================================
# 3. Two-layer MLP backprop (the core!)
# ============================================
print("\n[3] 2์ธต MLP ์—ญ์ „ํŒŒ")
print("-" * 40)
118
class MLPFromScratch:
    """
    Two-layer MLP trained with hand-written backpropagation.

    Architecture: input -> [W1, b1] -> ReLU -> [W2, b2] -> Sigmoid -> output

    forward() caches every intermediate tensor (z1, a1, z2, a2) so that
    backward() can apply the chain rule without recomputation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # He (Kaiming) initialization: weights scaled by sqrt(2 / fan_in),
        # the appropriate scheme for ReLU layers. (Note: the original
        # comment called this "Xavier", but sqrt(2/n) is He scaling.)
        self.W1 = np.sqrt(2.0 / input_dim) * np.random.randn(input_dim, hidden_dim)
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.sqrt(2.0 / hidden_dim) * np.random.randn(hidden_dim, output_dim)
        self.b2 = np.zeros(output_dim)

        print(f"MLP ์ƒ์„ฑ: {input_dim} โ†’ {hidden_dim} โ†’ {output_dim}")

    def forward(self, X):
        """Forward pass; caches intermediates needed by backward()."""
        # Hidden layer: affine transform followed by ReLU.
        self.z1 = X @ self.W1 + self.b1
        self.a1 = relu(self.z1)

        # Output layer: affine transform followed by sigmoid.
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = sigmoid(self.z2)

        return self.a2

    def backward(self, X, y_true):
        """
        Backward pass: apply the chain rule layer by layer.

        Key identities:
        dL/dW2 = a1.T @ (dL/dz2)
        dL/dW1 = X.T @ (dL/dz1)

        Returns a dict of gradients keyed by parameter name.
        """
        batch_size = X.shape[0]

        # --- Output layer ---
        # MSE loss: dL/da2 = 2(a2 - y) / m, then fold through sigmoid'(z2).
        delta_out = 2 * (self.a2 - y_true) / batch_size * sigmoid_derivative(self.z2)

        grad_W2 = self.a1.T @ delta_out
        grad_b2 = delta_out.sum(axis=0)

        # --- Hidden layer ---
        # Propagate the error back through W2, then gate by relu'(z1).
        delta_hidden = (delta_out @ self.W2.T) * relu_derivative(self.z1)

        grad_W1 = X.T @ delta_hidden
        grad_b1 = delta_hidden.sum(axis=0)

        return {'W1': grad_W1, 'b1': grad_b1, 'W2': grad_W2, 'b2': grad_b2}

    def update(self, grads, lr):
        """One vanilla gradient-descent step (in place on each parameter)."""
        self.W1 -= lr * grads['W1']
        self.b1 -= lr * grads['b1']
        self.W2 -= lr * grads['W2']
        self.b2 -= lr * grads['b2']

    def loss(self, y_pred, y_true):
        """Mean squared error between predictions and targets."""
        residual = y_pred - y_true
        return np.mean(residual ** 2)
190
191
# ============================================
# 4. Testing on the XOR problem
# ============================================
print("\n[4] XOR ๋ฌธ์ œ ํ•™์Šต")
print("-" * 40)

# XOR dataset: 4 samples, 2 features; not linearly separable,
# so a hidden layer is required.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float64)
y = np.array([[0], [1], [1], [0]], dtype=np.float64)

# Build the model (seeded for reproducible weights).
np.random.seed(42)
mlp = MLPFromScratch(input_dim=2, hidden_dim=8, output_dim=1)

# Training hyperparameters.
learning_rate = 1.0
epochs = 2000
losses = []

for epoch in range(epochs):
    # Forward pass
    y_pred = mlp.forward(X)
    loss = mlp.loss(y_pred, y)
    losses.append(loss)

    # Backward pass
    grads = mlp.backward(X, y)

    # Gradient-descent weight update
    mlp.update(grads, learning_rate)

    if (epoch + 1) % 400 == 0:
        print(f"Epoch {epoch+1}: Loss = {loss:.6f}")

# Inspect the learned mapping on all four XOR inputs.
print("\nํ•™์Šต ๊ฒฐ๊ณผ:")
y_final = mlp.forward(X)
for i in range(4):
    print(f"  {X[i]} โ†’ {y_final[i, 0]:.4f} (์ •๋‹ต: {y[i, 0]})")

# Plot the loss curve on a log scale and save it to disk.
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('XOR Training Loss (NumPy Backprop)')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.savefig('numpy_xor_loss.png', dpi=100)
plt.close()
print("\n์†์‹ค ๊ทธ๋ž˜ํ”„ ์ €์žฅ: numpy_xor_loss.png")
243
244
# ============================================
# 5. Gradient checking
# ============================================
print("\n[5] ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ")
print("-" * 40)
250
def numerical_gradient(model, X, y, param_name, h=1e-5):
    """Estimate dL/dparam by central differences, one entry at a time.

    Temporarily perturbs each element of the named parameter by +/- h,
    re-runs the forward pass and loss, and restores the original value
    before moving on. The model only needs forward(X) and loss(pred, y).
    """
    param = getattr(model, param_name)
    grad = np.zeros_like(param)

    # np.ndindex walks the array in C order, same as nditer's multi_index.
    for idx in np.ndindex(param.shape):
        saved = param[idx]

        # f(theta + h)
        param[idx] = saved + h
        loss_hi = model.loss(model.forward(X), y)

        # f(theta - h)
        param[idx] = saved - h
        loss_lo = model.loss(model.forward(X), y)

        # Central-difference quotient.
        grad[idx] = (loss_hi - loss_lo) / (2 * h)

        param[idx] = saved  # restore before the next element

    return grad
276
277# ์ž‘์€ ๋ชจ๋ธ๋กœ ํ…Œ์ŠคํŠธ
278np.random.seed(0)
279small_mlp = MLPFromScratch(2, 4, 1)
280
281# ์ˆœ์ „ํŒŒ
282y_pred = small_mlp.forward(X)
283
284# ํ•ด์„์  ๊ธฐ์šธ๊ธฐ (์—ญ์ „ํŒŒ)
285analytical_grads = small_mlp.backward(X, y)
286
287# ์ˆ˜์น˜์  ๊ธฐ์šธ๊ธฐ
288numerical_W1 = numerical_gradient(small_mlp, X, y, 'W1')
289numerical_W2 = numerical_gradient(small_mlp, X, y, 'W2')
290
291# ๋น„๊ต
292diff_W1 = np.linalg.norm(analytical_grads['W1'] - numerical_W1)
293diff_W2 = np.linalg.norm(analytical_grads['W2'] - numerical_W2)
294
295print(f"W1 ๊ธฐ์šธ๊ธฐ ์ฐจ์ด: {diff_W1:.2e}")
296print(f"W2 ๊ธฐ์šธ๊ธฐ ์ฐจ์ด: {diff_W2:.2e}")
297
298if diff_W1 < 1e-5 and diff_W2 < 1e-5:
299    print("โœ“ ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ ํ†ต๊ณผ!")
300else:
301    print("โœ— ๊ธฐ์šธ๊ธฐ ๊ฒ€์ฆ ์‹คํŒจ")
302
303
# ============================================
# 6. Chain-rule visualization
# ============================================
print("\n[6] ์ฒด์ธ ๋ฃฐ ํ๋ฆ„")
print("-" * 40)

# ASCII diagram of the forward/backward data flow, printed verbatim.
chain_rule_diagram = """
์ˆœ์ „ํŒŒ (Forward):
    x โ”€โ”€โ–ถ z1=xW1+b1 โ”€โ”€โ–ถ a1=relu(z1) โ”€โ”€โ–ถ z2=a1W2+b2 โ”€โ”€โ–ถ a2=ฯƒ(z2) โ”€โ”€โ–ถ L=MSE

์—ญ์ „ํŒŒ (Backward):
    dL/dW1 โ—€โ”€โ”€ dL/dz1 โ—€โ”€โ”€ dL/da1 โ—€โ”€โ”€ dL/dz2 โ—€โ”€โ”€ dL/da2 โ—€โ”€โ”€ dL/dL=1

์ฒด์ธ ๋ฃฐ ์ ์šฉ:
    dL/dW2 = (dL/da2) ร— (da2/dz2) ร— (dz2/dW2)
           = 2(a2-y) ร— ฯƒ'(z2) ร— a1.T

    dL/dW1 = (dL/da2) ร— (da2/dz2) ร— (dz2/da1) ร— (da1/dz1) ร— (dz1/dW1)
           = 2(a2-y) ร— ฯƒ'(z2) ร— W2.T ร— relu'(z1) ร— x.T
"""
print(chain_rule_diagram)
325
326
# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("์—ญ์ „ํŒŒ ํ•ต์‹ฌ ์ •๋ฆฌ")
print("=" * 60)

# Recap of the whole pipeline, printed verbatim.
summary = """
1. ์ˆœ์ „ํŒŒ: ์ž…๋ ฅ โ†’ ์ถœ๋ ฅ ๋ฐฉํ–ฅ์œผ๋กœ ๊ฐ’ ๊ณ„์‚ฐ
2. ์†์‹ค ๊ณ„์‚ฐ: ์˜ˆ์ธก๊ณผ ์ •๋‹ต์˜ ์ฐจ์ด
3. ์—ญ์ „ํŒŒ: ์ถœ๋ ฅ โ†’ ์ž…๋ ฅ ๋ฐฉํ–ฅ์œผ๋กœ ๊ธฐ์šธ๊ธฐ ๊ณ„์‚ฐ (์ฒด์ธ ๋ฃฐ)
4. ์—…๋ฐ์ดํŠธ: W = W - lr ร— (dL/dW)

ํ•ต์‹ฌ ๊ณต์‹:
- ์ถœ๋ ฅ์ธต: dL/dz2 = dL/da2 ร— ฯƒ'(z2)
- ์€๋‹‰์ธต: dL/dz1 = (dL/dz2 @ W2.T) ร— relu'(z1)
- ๊ฐ€์ค‘์น˜: dL/dW = ์ด์ „์ธต์ถœ๋ ฅ.T @ ํ˜„์žฌ์ธต๊ธฐ์šธ๊ธฐ

PyTorch์—์„œ๋Š”:
    loss.backward()  # ์ด ํ•œ ์ค„์ด ์œ„์˜ ๋ชจ๋“  ๊ณผ์ •์„ ์ž๋™ ์ˆ˜ํ–‰!

NumPy ๊ตฌํ˜„์˜ ๊ฐ€์น˜:
1. ํ–‰๋ ฌ ๊ณฑ์…ˆ์˜ ์ „์น˜ ๋ฐฉํ–ฅ ์ดํ•ด
2. ํ™œ์„ฑํ™” ํ•จ์ˆ˜ ๋ฏธ๋ถ„์˜ ์—ญํ•  ์ดํ•ด
3. ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ์—์„œ ํ•ฉ์‚ฐ์ด ํ•„์š”ํ•œ ์ด์œ  ์ดํ•ด
"""
print(summary)
print("=" * 60)