"""
03. ์ญ์ ํ (Backpropagation) - NumPy ๋ฒ์ 

NumPy๋ก ์ญ์ ํ๋ฅผ ์ง์ ๊ตฌํํ์ฌ ์๋ฆฌ๋ฅผ ์ดํดํฉ๋๋ค.
์ด ํ์ผ์ด ๋ฅ๋ฌ๋ ์ดํด์ ํต์ฌ์๋๋ค!

PyTorch์์๋ loss.backward() ํ ์ค์ด์ง๋ง,
์ฌ๊ธฐ์๋ ์ฒด์ธ ๋ฃฐ์ ์ง์ ์ ์ฉํฉ๋๋ค.
"""
10
11import numpy as np
12import matplotlib.pyplot as plt
13
# Banner for the whole script, then section 1 header.
print("=" * 60)
print("NumPy ์ญ์ ํ (Backpropagation) from scratch")
print("=" * 60)


# ============================================
# 1. Activation functions and their derivatives
# ============================================
print("\n[1] ํ์ฑํ ํจ์์ ๋ฏธ๋ถ")
print("-" * 40)
24
def sigmoid(x):
    """Numerically stable logistic sigmoid: 1 / (1 + exp(-x)).

    The argument is clipped to [-500, 500] first so np.exp cannot
    overflow for large-magnitude inputs (sigmoid saturates anyway).
    """
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))


def sigmoid_derivative(x):
    """sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))."""
    s = sigmoid(x)
    return s * (1 - s)


def relu(x):
    """Rectified linear unit: max(0, x), element-wise."""
    return np.maximum(0, x)


def relu_derivative(x):
    """relu'(x) = 1 where x > 0, else 0 (the subgradient at 0 is taken as 0)."""
    return (x > 0).astype(float)
39
# Quick check of the activations and their derivatives on a few points.
x = np.array([-2, -1, 0, 1, 2])
print(f"x: {x}")
print(f"sigmoid(x): {sigmoid(x).round(4)}")
print(f"sigmoid'(x): {sigmoid_derivative(x).round(4)}")
print(f"relu(x): {relu(x)}")
print(f"relu'(x): {relu_derivative(x)}")
47
48
# ============================================
# 2. Single-neuron backprop (for intuition)
# ============================================
print("\n[2] ๋จ์ผ ๋ด๋ฐ ์ญ์ ํ")
print("-" * 40)
54
class SingleNeuron:
    """A single neuron y = sigmoid(w*x + b) with squared-error loss.

    Loss: L = (y - target)^2. forward() caches every intermediate value
    so backward() can apply the chain rule factor by factor.
    """

    def __init__(self):
        # Random scalar weight and bias.
        self.w = np.random.randn()
        self.b = np.random.randn()

    def forward(self, x, target):
        """Forward pass; returns (activation, loss) and caches x/z/a/target."""
        self.x = x
        self.target = target

        # Step-by-step computation (cached for backward).
        self.z = self.w * x + self.b        # linear transform
        self.a = sigmoid(self.z)            # activation
        self.loss = (self.a - target) ** 2  # squared error

        return self.a, self.loss

    def backward(self):
        """Backward pass: apply the chain rule to the cached forward values.

        dL/dw = (dL/da) * (da/dz) * (dz/dw)
        dL/db = (dL/da) * (da/dz) * (dz/db)

        Returns (dL/dw, dL/db). Must be called after forward().
        """
        # 1. loss -> activation
        dL_da = 2 * (self.a - self.target)

        # 2. activation -> pre-activation (sigmoid derivative)
        da_dz = sigmoid_derivative(self.z)

        # 3. pre-activation -> weight / bias
        dz_dw = self.x
        dz_db = 1

        # Chain rule: multiply the local derivatives together.
        dL_dw = dL_da * da_dz * dz_dw
        dL_db = dL_da * da_dz * dz_db

        return dL_dw, dL_db
98
# Demo: one forward/backward pass of the single neuron on one example.
neuron = SingleNeuron()
x, target = 2.0, 1.0

print(f"์๋ ฅ: x={x}, target={target}")
print(f"์ด๊ธฐ ๊ฐ์ค์น: w={neuron.w:.4f}, b={neuron.b:.4f}")

pred, loss = neuron.forward(x, target)
print(f"์์ธก: {pred:.4f}, ์์ค: {loss:.4f}")

dw, db = neuron.backward()
print(f"๊ธฐ์ธ๊ธฐ: dL/dw={dw:.4f}, dL/db={db:.4f}")
111
112
# ============================================
# 3. Two-layer MLP backprop (the core!)
# ============================================
print("\n[3] 2์ธต MLP ์ญ์ ํ")
print("-" * 40)
118
class MLPFromScratch:
    """Two-layer MLP with hand-written backpropagation.

    Architecture: input -> [W1, b1] -> ReLU -> [W2, b2] -> Sigmoid -> output.
    forward() caches z1/a1/z2/a2 so backward() can apply the chain rule.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # He initialization: scale = sqrt(2 / fan_in), suited to the ReLU
        # hidden layer. (The original comment called this "Xavier", but the
        # 2/fan_in factor is He's scheme; Xavier uses 1/fan_in.)
        self.W1 = np.random.randn(input_dim, hidden_dim) * np.sqrt(2.0 / input_dim)
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.random.randn(hidden_dim, output_dim) * np.sqrt(2.0 / hidden_dim)
        self.b2 = np.zeros(output_dim)

        print(f"MLP ์์ฑ: {input_dim} โ {hidden_dim} โ {output_dim}")

    def forward(self, X):
        """Forward pass over a batch X of shape (m, input_dim); caches intermediates."""
        # First layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = relu(self.z1)

        # Second layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = sigmoid(self.z2)

        return self.a2

    def backward(self, X, y_true):
        """Backprop: compute gradients of the batch-mean MSE via the chain rule.

        Key formulas:
          dL/dW2 = a1.T @ (dL/dz2)
          dL/dW1 = X.T  @ (dL/dz1)

        Returns a dict {'W1', 'b1', 'W2', 'b2'} of gradients.
        Must be called right after forward(X) (it reads the cached values).
        """
        m = X.shape[0]  # batch size

        # ===== output layer =====
        # dL/da2 = 2(a2 - y) / m for the batch-mean MSE
        dL_da2 = 2 * (self.a2 - y_true) / m

        # dL/dz2 = dL/da2 * sigmoid'(z2)
        dL_dz2 = dL_da2 * sigmoid_derivative(self.z2)

        # dL/dW2 = a1.T @ dL/dz2; biases sum the gradient over the batch.
        dW2 = self.a1.T @ dL_dz2
        db2 = np.sum(dL_dz2, axis=0)

        # ===== hidden layer =====
        # Propagate the gradient back through W2.
        dL_da1 = dL_dz2 @ self.W2.T

        # Gate it with relu'(z1).
        dL_dz1 = dL_da1 * relu_derivative(self.z1)

        # dL/dW1 = X.T @ dL/dz1
        dW1 = X.T @ dL_dz1
        db1 = np.sum(dL_dz1, axis=0)

        return {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}

    def update(self, grads, lr):
        """One gradient-descent step on all parameters (in place)."""
        self.W1 -= lr * grads['W1']
        self.b1 -= lr * grads['b1']
        self.W2 -= lr * grads['W2']
        self.b2 -= lr * grads['b2']

    def loss(self, y_pred, y_true):
        """Mean-squared-error loss."""
        return np.mean((y_pred - y_true) ** 2)
190
191
# ============================================
# 4. Sanity check: learn the XOR problem
# ============================================
print("\n[4] XOR ๋ฌธ์ ํ์ต")
print("-" * 40)

# XOR data: 4 examples, 2 inputs, 1 target each.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float64)
y = np.array([[0], [1], [1], [0]], dtype=np.float64)

# Model (seeded for reproducibility).
np.random.seed(42)
mlp = MLPFromScratch(input_dim=2, hidden_dim=8, output_dim=1)

# Training loop: forward -> loss -> backward -> update.
learning_rate = 1.0
epochs = 2000
losses = []

for epoch in range(epochs):
    # Forward pass
    y_pred = mlp.forward(X)
    loss = mlp.loss(y_pred, y)
    losses.append(loss)

    # Backward pass
    grads = mlp.backward(X, y)

    # Parameter update
    mlp.update(grads, learning_rate)

    if (epoch + 1) % 400 == 0:
        print(f"Epoch {epoch+1}: Loss = {loss:.6f}")

# Final predictions vs targets.
print("\nํ์ต ๊ฒฐ๊ณผ:")
y_final = mlp.forward(X)
for i in range(4):
    print(f" {X[i]} โ {y_final[i, 0]:.4f} (์ ๋ต: {y[i, 0]})")

# Loss curve (log scale), saved to disk.
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('XOR Training Loss (NumPy Backprop)')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.savefig('numpy_xor_loss.png', dpi=100)
plt.close()
print("\n์์ค ๊ทธ๋ํ ์ ์ฅ: numpy_xor_loss.png")
243
244
# ============================================
# 5. Gradient checking
# ============================================
print("\n[5] ๊ธฐ์ธ๊ธฐ ๊ฒ์ฆ")
print("-" * 40)
250
def numerical_gradient(model, X, y, param_name, h=1e-5):
    """Estimate dLoss/dParam by central finite differences.

    For each entry of ``getattr(model, param_name)``, perturbs it in place
    by +h and -h, re-evaluates ``model.loss(model.forward(X), y)``, and
    restores the original value — so the parameter is unchanged on return.

    Args:
        model: object exposing forward(X), loss(y_pred, y) and the parameter
            as a mutable ndarray attribute.
        X, y: batch inputs and targets passed through to the model.
        param_name: attribute name of the parameter array to differentiate.
        h: finite-difference step size.

    Returns:
        ndarray of the same shape as the parameter, holding the numerical
        gradient (loss_plus - loss_minus) / (2h) per entry.
    """
    param = getattr(model, param_name)
    grad = np.zeros_like(param)

    # multi_index iteration visits every entry of an arbitrary-shape array.
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        original = param[idx]

        # f(x + h)
        param[idx] = original + h
        loss_plus = model.loss(model.forward(X), y)

        # f(x - h)
        param[idx] = original - h
        loss_minus = model.loss(model.forward(X), y)

        # Central difference.
        grad[idx] = (loss_plus - loss_minus) / (2 * h)

        param[idx] = original  # restore before moving on
        it.iternext()

    return grad
276
# Compare analytical (backprop) vs numerical gradients on a small model.
np.random.seed(0)
small_mlp = MLPFromScratch(2, 4, 1)

# Forward pass (caches the intermediates backward() needs).
y_pred = small_mlp.forward(X)

# Analytical gradients from backprop.
analytical_grads = small_mlp.backward(X, y)

# Numerical gradients by finite differences.
numerical_W1 = numerical_gradient(small_mlp, X, y, 'W1')
numerical_W2 = numerical_gradient(small_mlp, X, y, 'W2')

# Frobenius-norm difference between the two estimates.
diff_W1 = np.linalg.norm(analytical_grads['W1'] - numerical_W1)
diff_W2 = np.linalg.norm(analytical_grads['W2'] - numerical_W2)

print(f"W1 ๊ธฐ์ธ๊ธฐ ์ฐจ์ด: {diff_W1:.2e}")
print(f"W2 ๊ธฐ์ธ๊ธฐ ์ฐจ์ด: {diff_W2:.2e}")

if diff_W1 < 1e-5 and diff_W2 < 1e-5:
    print("โ ๊ธฐ์ธ๊ธฐ ๊ฒ์ฆ ํต๊ณผ!")
else:
    print("โ ๊ธฐ์ธ๊ธฐ ๊ฒ์ฆ ์คํจ")
302
303
# ============================================
# 6. Visualizing the chain-rule flow
# ============================================
print("\n[6] ์ฒด์ธ ๋ฃฐ ํ๋ฆ")
print("-" * 40)

# ASCII diagram of the forward pass and the matching backward factors.
chain_rule_diagram = """
์์ ํ (Forward):
  x โโโถ z1=xW1+b1 โโโถ a1=relu(z1) โโโถ z2=a1W2+b2 โโโถ a2=ฯ(z2) โโโถ L=MSE

์ญ์ ํ (Backward):
  dL/dW1 โโโ dL/dz1 โโโ dL/da1 โโโ dL/dz2 โโโ dL/da2 โโโ dL/dL=1

์ฒด์ธ ๋ฃฐ ์ ์ฉ:
  dL/dW2 = (dL/da2) ร (da2/dz2) ร (dz2/dW2)
         = 2(a2-y) ร ฯ'(z2) ร a1.T

  dL/dW1 = (dL/da2) ร (da2/dz2) ร (dz2/da1) ร (da1/dz1) ร (dz1/dW1)
         = 2(a2-y) ร ฯ'(z2) ร W2.T ร relu'(z1) ร x.T
"""
print(chain_rule_diagram)
325
326
# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("์ญ์ ํ ํต์ฌ ์ ๋ฆฌ")
print("=" * 60)

# Key takeaways, printed verbatim at the end of the run.
summary = """
1. ์์ ํ: ์๋ ฅ โ ์ถ๋ ฅ ๋ฐฉํฅ์ผ๋ก ๊ฐ ๊ณ์ฐ
2. ์์ค ๊ณ์ฐ: ์์ธก๊ณผ ์ ๋ต์ ์ฐจ์ด
3. ์ญ์ ํ: ์ถ๋ ฅ โ ์๋ ฅ ๋ฐฉํฅ์ผ๋ก ๊ธฐ์ธ๊ธฐ ๊ณ์ฐ (์ฒด์ธ ๋ฃฐ)
4. ์๋ฐ์ดํธ: W = W - lr ร (dL/dW)

ํต์ฌ ๊ณต์:
- ์ถ๋ ฅ์ธต: dL/dz2 = dL/da2 ร ฯ'(z2)
- ์๋์ธต: dL/dz1 = (dL/dz2 @ W2.T) ร relu'(z1)
- ๊ฐ์ค์น: dL/dW = ์ด์ ์ธต์ถ๋ ฅ.T @ ํ์ฌ์ธต๊ธฐ์ธ๊ธฐ

PyTorch์์๋:
  loss.backward()  # ์ด ํ ์ค์ด ์์ ๋ชจ๋ ๊ณผ์ ์ ์๋ ์ํ!

NumPy ๊ตฌํ์ ๊ฐ์น:
1. ํ๋ ฌ ๊ณฑ์์ ์ ์น ๋ฐฉํฅ ์ดํด
2. ํ์ฑํ ํจ์ ๋ฏธ๋ถ์ ์ญํ ์ดํด
3. ๋ฐฐ์น ์ฒ๋ฆฌ์์ ํฉ์ฐ์ด ํ์ํ ์ด์ ์ดํด
"""
print(summary)
print("=" * 60)