"""
Multi-Layer Perceptron - NumPy from-scratch implementation.

This file implements an MLP in pure NumPy. The backpropagation
algorithm is implemented by hand to illustrate the core mechanics
of deep learning.

Learning goals:
1. Forward pass: forward propagation through a multi-layer network
2. Backward pass: backpropagation via the chain rule
3. Activation functions: ReLU, Sigmoid, Tanh
4. Weight initialization: Xavier and He initialization
"""
14
15import numpy as np
16
17
class ActivationFunctions:
    """Element-wise activation functions and their derivatives.

    All methods are static and operate on NumPy arrays of any shape
    (softmax expects a 2-D batch and normalizes along axis 1).
    """

    @staticmethod
    def relu(z):
        """max(0, z), element-wise."""
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        """1.0 where z > 0, else 0.0."""
        return np.where(z > 0, 1.0, 0.0)

    @staticmethod
    def sigmoid(z):
        """Logistic function 1 / (1 + e^-z), clipped for numerical safety."""
        clipped = np.clip(z, -500, 500)  # avoid overflow in exp for large |z|
        return 1.0 / (1.0 + np.exp(-clipped))

    @staticmethod
    def sigmoid_derivative(z):
        """sigma(z) * (1 - sigma(z))."""
        s = ActivationFunctions.sigmoid(z)
        return s * (1.0 - s)

    @staticmethod
    def tanh(z):
        """Hyperbolic tangent."""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(z):
        """1 - tanh(z)^2."""
        t = np.tanh(z)
        return 1.0 - t * t

    @staticmethod
    def softmax(z):
        """Row-wise softmax over a (batch, classes) array."""
        # Subtract the per-row max for numerical stability before exponentiating.
        shifted = z - z.max(axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        return exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
53
54
class Layer:
    """
    A single fully connected layer.

    Computes z = xW + b (affine transform) followed by a = sigma(z)
    (element-wise activation). Caches forward-pass tensors so that
    backward() can compute gradients with the chain rule.
    """

    def __init__(self, input_dim: int, output_dim: int, activation: str = 'relu'):
        """
        Args:
            input_dim: number of input features
            output_dim: number of output units
            activation: one of 'relu', 'sigmoid', 'tanh', 'none'

        Raises:
            ValueError: if `activation` is not a supported name.
        """
        if activation == 'relu':
            # He initialization (variance 2/fan_in) suits ReLU, which zeroes
            # half of its inputs on average.
            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(2.0 / input_dim)
        else:
            # Xavier initialization (variance 1/fan_in) for symmetric activations.
            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(1.0 / input_dim)

        self.b = np.zeros((1, output_dim))

        self.activation = activation
        self._get_activation_fn()

        # Gradients, populated by backward()
        self.dW = None
        self.db = None

        # Cache of forward-pass tensors needed by backward()
        self.cache = {}

    def _get_activation_fn(self):
        """Resolve self.activation to (function, derivative).

        Raises:
            ValueError: with the list of supported names, instead of the
                bare KeyError the dict lookup would otherwise produce.
        """
        activations = {
            'relu': (ActivationFunctions.relu, ActivationFunctions.relu_derivative),
            'sigmoid': (ActivationFunctions.sigmoid, ActivationFunctions.sigmoid_derivative),
            'tanh': (ActivationFunctions.tanh, ActivationFunctions.tanh_derivative),
            'none': (lambda x: x, lambda x: np.ones_like(x)),
        }
        try:
            self.act_fn, self.act_derivative = activations[self.activation]
        except KeyError:
            raise ValueError(
                f"Unknown activation {self.activation!r}; "
                f"expected one of {sorted(activations)}"
            ) from None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """
        Forward pass.

        Args:
            x: input batch of shape (batch_size, input_dim)

        Returns:
            Activated output of shape (batch_size, output_dim).
        """
        # Cache the input for use in backward().
        self.cache['x'] = x

        # Affine transform: z = xW + b
        z = np.dot(x, self.W) + self.b
        self.cache['z'] = z

        # Activation: a = sigma(z)
        a = self.act_fn(z)
        self.cache['a'] = a

        return a

    def backward(self, da: np.ndarray) -> np.ndarray:
        """
        Backward pass (chain rule).

        Args:
            da: gradient of the loss w.r.t. this layer's output,
                shape (batch_size, output_dim)

        Returns:
            Gradient of the loss w.r.t. this layer's input,
            shape (batch_size, input_dim).
        """
        x = self.cache['x']
        z = self.cache['z']
        batch_size = x.shape[0]

        # dL/dz = dL/da * sigma'(z)
        dz = da * self.act_derivative(z)

        # dL/dW = x^T . dz, averaged over the batch
        self.dW = np.dot(x.T, dz) / batch_size

        # dL/db = sum over the batch of dz, averaged
        self.db = np.sum(dz, axis=0, keepdims=True) / batch_size

        # dL/dx = dz . W^T (propagated to the previous layer)
        dx = np.dot(dz, self.W.T)

        return dx
146 dx = np.dot(dz, self.W.T)
147
148 return dx
149
150
class MLPNumpy:
    """
    Multi-Layer Perceptron (pure NumPy implementation).

    Example:
        model = MLPNumpy([784, 256, 128, 10], activations=['relu', 'relu', 'none'])
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
    """

    def __init__(self, layer_dims: list, activations: list = None):
        """
        Args:
            layer_dims: dimensions per layer [input, hidden1, ..., output]
            activations: activation name per layer; defaults to ReLU for
                hidden layers and 'none' (linear) for the output layer.

        Raises:
            ValueError: if fewer than two dims are given, or the number of
                activations does not match the number of layers.
        """
        n_layers = len(layer_dims) - 1
        if n_layers < 1:
            raise ValueError("layer_dims needs at least an input and an output dimension")

        if activations is None:
            activations = ['relu'] * (n_layers - 1) + ['none']
        if len(activations) != n_layers:
            raise ValueError(f"expected {n_layers} activations, got {len(activations)}")

        self.layers = [
            Layer(layer_dims[i], layer_dims[i + 1], activations[i])
            for i in range(n_layers)
        ]

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass through the whole network; returns the final logits."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, loss_grad: np.ndarray) -> None:
        """Backward pass through the whole network, filling each layer's gradients."""
        grad = loss_grad
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    @staticmethod
    def _to_onehot(y: np.ndarray, n_classes: int) -> np.ndarray:
        """Convert sparse integer labels (n,) to a one-hot matrix (n, n_classes)."""
        onehot = np.zeros((len(y), n_classes))
        onehot[np.arange(len(y)), y] = 1
        return onehot

    def compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Cross-entropy loss (classification).

        L = -1/n * sum(y_true * log(y_pred))

        Args:
            y_true: labels, sparse (n,) or one-hot (n, n_classes)
            y_pred: predicted probabilities (n, n_classes)
        """
        eps = 1e-15  # keep log() away from 0 and 1
        y_pred = np.clip(y_pred, eps, 1 - eps)

        if y_true.ndim == 1:
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

    def compute_loss_gradient(self, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        """
        Cross-entropy gradient, assuming softmax outputs.

        For softmax + cross-entropy the gradient w.r.t. the logits
        simplifies to: dL/dz = y_pred - y_true.
        """
        if y_true.ndim == 1:
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        return y_pred - y_true

    def update_weights(self, lr: float) -> None:
        """Plain SGD parameter update using each layer's cached gradients."""
        for layer in self.layers:
            layer.W -= lr * layer.dW
            layer.b -= lr * layer.db

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        epochs: int = 100,
        lr: float = 0.01,
        batch_size: int = 32,
        verbose: bool = True
    ) -> list:
        """
        Train the model with mini-batch SGD.

        Args:
            X: training data (n_samples, n_features)
            y: labels, sparse (n_samples,) or one-hot (n_samples, n_classes)
            epochs: number of passes over the data
            lr: learning rate
            batch_size: mini-batch size
            verbose: print progress roughly 10 times over the run

        Returns:
            Per-epoch average losses.
        """
        n_samples = X.shape[0]
        losses = []
        # max(1, ...) avoids ZeroDivisionError when epochs < 10.
        log_every = max(1, epochs // 10)

        for epoch in range(epochs):
            # Shuffle; fancy indexing works for both 1-D and 2-D labels.
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            epoch_loss = 0.0

            # Mini-batch training loop
            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]

                # Forward pass to logits, then softmax to probabilities
                # (the last layer is expected to be linear / 'none').
                y_pred = self.forward(X_batch)
                y_pred = ActivationFunctions.softmax(y_pred)

                # Accumulate the sample-weighted loss for the epoch average.
                loss = self.compute_loss(y_batch, y_pred)
                epoch_loss += loss * len(X_batch)

                # Backward pass and SGD step
                loss_grad = self.compute_loss_gradient(y_batch, y_pred)
                self.backward(loss_grad)
                self.update_weights(lr)

            epoch_loss /= n_samples
            losses.append(epoch_loss)

            if verbose and (epoch + 1) % log_every == 0:
                accuracy = self.evaluate(X, y)
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}")

        return losses

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class index for each row of X."""
        logits = self.forward(X)
        probs = ActivationFunctions.softmax(logits)
        return np.argmax(probs, axis=1)

    def evaluate(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return classification accuracy; accepts sparse or one-hot labels."""
        predictions = self.predict(X)
        if y.ndim > 1:
            y = np.argmax(y, axis=1)
        return np.mean(predictions == y)
305
306
def load_mnist_sample(n_samples=1000, seed=42):
    """Generate deterministic MNIST-shaped synthetic data (for testing).

    Uses a local RandomState so the global NumPy RNG state is left
    untouched (the original seeded the global generator, which leaked
    into any caller relying on np.random). RandomState(seed) produces
    the exact same draws the old global-seed version did.

    Args:
        n_samples: number of rows to generate
        seed: RNG seed; same seed gives identical data

    Returns:
        (X, y): features of shape (n_samples, 784) and integer labels
        in [0, 10) of shape (n_samples,).
    """
    rng = np.random.RandomState(seed)

    # Simple synthetic stand-in (a real loader would fetch MNIST).
    n_classes = 10
    n_features = 784  # 28x28

    X = rng.randn(n_samples, n_features) * 0.5
    y = rng.randint(0, n_classes, n_samples)

    # Make the classes separable: bump one contiguous feature block per class.
    block = n_features // n_classes  # 78 features per class (last 4 unused)
    for i in range(n_classes):
        mask = y == i
        X[mask, i * block:(i + 1) * block] += 1.0

    return X, y
324
325
def main():
    """Run the end-to-end demo: data, model, training, evaluation, plots."""
    banner = "=" * 60
    print(banner)
    print("Multi-Layer Perceptron - NumPy From-Scratch ๊ตฌํ")
    print(banner)

    # Step 1: synthetic dataset
    print("\n1. ์ํ ๋ฐ์ดํฐ ์์ฑ")
    X_train, y_train = load_mnist_sample(n_samples=1000)
    X_test, y_test = load_mnist_sample(n_samples=200)
    print(f" Train: {X_train.shape}, Test: {X_test.shape}")

    # Step 2: build the model
    print("\n2. MLP ๋ชจ๋ธ ์ด๊ธฐํ")
    model = MLPNumpy(
        layer_dims=[784, 128, 64, 10],
        activations=['relu', 'relu', 'none'],
    )
    print(f" Layers: {[layer.W.shape for layer in model.layers]}")

    # Step 3: training
    print("\n3. ํ์ต ์์")
    loss_history = model.fit(
        X_train,
        y_train,
        epochs=50,
        lr=0.1,
        batch_size=32,
        verbose=True,
    )

    # Step 4: evaluation on train and test splits
    print("\n4. ํ๊ฐ ๊ฒฐ๊ณผ")
    train_acc = model.evaluate(X_train, y_train)
    test_acc = model.evaluate(X_test, y_test)
    print(f" Train Accuracy: {train_acc:.4f}")
    print(f" Test Accuracy: {test_acc:.4f}")

    # Step 5: optional visualization (skipped when matplotlib is unavailable)
    try:
        import matplotlib.pyplot as plt

        fig, (loss_ax, weight_ax) = plt.subplots(1, 2, figsize=(12, 4))

        # Training-loss curve
        loss_ax.plot(loss_history)
        loss_ax.set_xlabel('Epoch')
        loss_ax.set_ylabel('Loss')
        loss_ax.set_title('Training Loss')
        loss_ax.grid(True)

        # Histogram of the first layer's weights
        weight_ax.hist(model.layers[0].W.flatten(), bins=50, alpha=0.7)
        weight_ax.set_xlabel('Weight Value')
        weight_ax.set_ylabel('Frequency')
        weight_ax.set_title('First Layer Weight Distribution')
        weight_ax.grid(True)

        plt.tight_layout()
        plt.savefig('mlp_result.png', dpi=150)
        plt.show()
        print("\n๊ฒฐ๊ณผ ์ด๋ฏธ์ง ์ ์ฅ: mlp_result.png")

    except ImportError:
        print("\n(matplotlib ์์, ์๊ฐํ ์๋ต)")
390
391
392if __name__ == "__main__":
393 main()