# mlp_numpy.py
"""
Multi-Layer Perceptron -- a from-scratch NumPy implementation.

This file implements an MLP in pure NumPy.  The backpropagation
algorithm is implemented by hand to illustrate the core mechanics
of deep learning.

Learning goals:
1. Forward pass: forward propagation through a multi-layer network
2. Backward pass: backpropagation via the chain rule
3. Activation functions: ReLU, Sigmoid, Tanh
4. Weight initialization: Xavier and He initialization
"""
 14
 15import numpy as np
 16
 17
 18class ActivationFunctions:
 19    """ํ™œ์„ฑํ™” ํ•จ์ˆ˜์™€ ๊ทธ ๋ฏธ๋ถ„"""
 20
 21    @staticmethod
 22    def relu(z):
 23        return np.maximum(0, z)
 24
 25    @staticmethod
 26    def relu_derivative(z):
 27        return (z > 0).astype(float)
 28
 29    @staticmethod
 30    def sigmoid(z):
 31        # ์ˆ˜์น˜ ์•ˆ์ •์„ฑ์„ ์œ„ํ•ด ํด๋ฆฌํ•‘
 32        z = np.clip(z, -500, 500)
 33        return 1 / (1 + np.exp(-z))
 34
 35    @staticmethod
 36    def sigmoid_derivative(z):
 37        s = ActivationFunctions.sigmoid(z)
 38        return s * (1 - s)
 39
 40    @staticmethod
 41    def tanh(z):
 42        return np.tanh(z)
 43
 44    @staticmethod
 45    def tanh_derivative(z):
 46        return 1 - np.tanh(z) ** 2
 47
 48    @staticmethod
 49    def softmax(z):
 50        # ์ˆ˜์น˜ ์•ˆ์ •์„ฑ: ์ตœ๋Œ€๊ฐ’์„ ๋นผ์คŒ
 51        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
 52        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
 53
 54
 55class Layer:
 56    """
 57    ๋‹จ์ผ Fully Connected Layer
 58
 59    z = Wx + b (์„ ํ˜• ๋ณ€ํ™˜)
 60    a = ฯƒ(z)   (ํ™œ์„ฑํ™”)
 61    """
 62
 63    def __init__(self, input_dim: int, output_dim: int, activation: str = 'relu'):
 64        """
 65        Args:
 66            input_dim: ์ž…๋ ฅ ์ฐจ์›
 67            output_dim: ์ถœ๋ ฅ ์ฐจ์›
 68            activation: 'relu', 'sigmoid', 'tanh', 'none'
 69        """
 70        # He ์ดˆ๊ธฐํ™” (ReLU์šฉ)
 71        if activation == 'relu':
 72            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(2.0 / input_dim)
 73        else:
 74            # Xavier ์ดˆ๊ธฐํ™”
 75            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(1.0 / input_dim)
 76
 77        self.b = np.zeros((1, output_dim))
 78
 79        self.activation = activation
 80        self._get_activation_fn()
 81
 82        # Gradients
 83        self.dW = None
 84        self.db = None
 85
 86        # Cache (for backward)
 87        self.cache = {}
 88
 89    def _get_activation_fn(self):
 90        """ํ™œ์„ฑํ™” ํ•จ์ˆ˜ ์„ค์ •"""
 91        activations = {
 92            'relu': (ActivationFunctions.relu, ActivationFunctions.relu_derivative),
 93            'sigmoid': (ActivationFunctions.sigmoid, ActivationFunctions.sigmoid_derivative),
 94            'tanh': (ActivationFunctions.tanh, ActivationFunctions.tanh_derivative),
 95            'none': (lambda x: x, lambda x: np.ones_like(x)),
 96        }
 97        self.act_fn, self.act_derivative = activations[self.activation]
 98
 99    def forward(self, x: np.ndarray) -> np.ndarray:
100        """
101        Forward pass
102
103        Args:
104            x: ์ž…๋ ฅ (batch_size, input_dim)
105
106        Returns:
107            a: ํ™œ์„ฑํ™” ์ถœ๋ ฅ (batch_size, output_dim)
108        """
109        # ์บ์‹œ ์ €์žฅ (backward์—์„œ ์‚ฌ์šฉ)
110        self.cache['x'] = x
111
112        # ์„ ํ˜• ๋ณ€ํ™˜: z = Wx + b
113        z = np.dot(x, self.W) + self.b
114        self.cache['z'] = z
115
116        # ํ™œ์„ฑํ™”: a = ฯƒ(z)
117        a = self.act_fn(z)
118        self.cache['a'] = a
119
120        return a
121
122    def backward(self, da: np.ndarray) -> np.ndarray:
123        """
124        Backward pass
125
126        Args:
127            da: ์ถœ๋ ฅ์˜ gradient (batch_size, output_dim)
128
129        Returns:
130            dx: ์ž…๋ ฅ์˜ gradient (batch_size, input_dim)
131        """
132        x = self.cache['x']
133        z = self.cache['z']
134        batch_size = x.shape[0]
135
136        # โˆ‚L/โˆ‚z = โˆ‚L/โˆ‚a ร— โˆ‚a/โˆ‚z = da ร— ฯƒ'(z)
137        dz = da * self.act_derivative(z)
138
139        # โˆ‚L/โˆ‚W = x^T ร— โˆ‚L/โˆ‚z
140        self.dW = np.dot(x.T, dz) / batch_size
141
142        # โˆ‚L/โˆ‚b = sum(โˆ‚L/โˆ‚z)
143        self.db = np.sum(dz, axis=0, keepdims=True) / batch_size
144
145        # โˆ‚L/โˆ‚x = โˆ‚L/โˆ‚z ร— W^T (๋‹ค์Œ ๋ ˆ์ด์–ด๋กœ ์ „ํŒŒ)
146        dx = np.dot(dz, self.W.T)
147
148        return dx
149
150
class MLPNumpy:
    """
    Multi-Layer Perceptron (pure-NumPy implementation).

    Example:
        model = MLPNumpy([784, 256, 128, 10], activations=['relu', 'relu', 'none'])
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
    """

    def __init__(self, layer_dims: list, activations: list = None):
        """
        Args:
            layer_dims: dimensions of each layer [input, hidden1, ..., output]
            activations: activation name per layer; defaults to ReLU for
                every hidden layer and 'none' (linear logits) for the output.
        """
        self.layers = []
        n_layers = len(layer_dims) - 1

        if activations is None:
            activations = ['relu'] * (n_layers - 1) + ['none']

        for i in range(n_layers):
            layer = Layer(layer_dims[i], layer_dims[i + 1], activations[i])
            self.layers.append(layer)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass through the whole network; returns the raw logits."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, loss_grad: np.ndarray) -> None:
        """Backward pass through the whole network (fills each layer's dW/db)."""
        grad = loss_grad
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    @staticmethod
    def _to_onehot(labels: np.ndarray, n_classes: int) -> np.ndarray:
        """Convert sparse integer labels (n,) to a one-hot matrix (n, n_classes)."""
        onehot = np.zeros((len(labels), n_classes))
        onehot[np.arange(len(labels)), labels] = 1
        return onehot

    def compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Cross-entropy loss (for classification).

        L = -1/n * sum(y_true * log(y_pred))

        Args:
            y_true: sparse int labels (n,) or one-hot matrix (n, n_classes)
            y_pred: predicted probabilities (n, n_classes)
        """
        eps = 1e-15  # keep log() away from 0
        y_pred = np.clip(y_pred, eps, 1 - eps)

        if y_true.ndim == 1:
            # Sparse labels -> one-hot
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        loss = -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
        return loss

    def compute_loss_gradient(self, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        """
        Cross-entropy gradient, assuming y_pred came through softmax.

        dL/dz = y_pred - y_true (the softmax + CE simplification)
        """
        if y_true.ndim == 1:
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        return y_pred - y_true

    def update_weights(self, lr: float) -> None:
        """Vanilla SGD step using each layer's cached gradients."""
        for layer in self.layers:
            layer.W -= lr * layer.dW
            layer.b -= lr * layer.db

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        epochs: int = 100,
        lr: float = 0.01,
        batch_size: int = 32,
        verbose: bool = True
    ) -> list:
        """
        Train the model with mini-batch SGD.

        Args:
            X: training data (n_samples, n_features)
            y: labels, sparse (n_samples,) or one-hot (n_samples, n_classes)
            epochs: number of epochs
            lr: learning rate
            batch_size: mini-batch size
            verbose: print progress roughly 10 times over the run

        Returns:
            losses: per-epoch mean loss
        """
        n_samples = X.shape[0]
        losses = []
        # Bug fix: `epochs // 10` is 0 when epochs < 10, which made the
        # modulo below raise ZeroDivisionError.  Clamp to at least 1.
        log_every = max(1, epochs // 10)

        for epoch in range(epochs):
            # Shuffle once per epoch.  Fancy indexing works the same for
            # sparse (1-D) and one-hot (2-D) labels.
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            epoch_loss = 0.0

            # Mini-batch training
            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]

                # Forward produces logits; softmax turns them into
                # probabilities (the last layer is expected to be 'none').
                y_pred = ActivationFunctions.softmax(self.forward(X_batch))

                # Loss, weighted by actual batch size (last batch may be short)
                loss = self.compute_loss(y_batch, y_pred)
                epoch_loss += loss * len(X_batch)

                # Backward + SGD update
                self.backward(self.compute_loss_gradient(y_batch, y_pred))
                self.update_weights(lr)

            epoch_loss /= n_samples
            losses.append(epoch_loss)

            if verbose and (epoch + 1) % log_every == 0:
                accuracy = self.evaluate(X, y)
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}")

        return losses

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class index for each sample."""
        logits = self.forward(X)
        probs = ActivationFunctions.softmax(logits)
        return np.argmax(probs, axis=1)

    def evaluate(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return classification accuracy; accepts sparse or one-hot labels."""
        predictions = self.predict(X)
        if y.ndim > 1:
            y = np.argmax(y, axis=1)
        return np.mean(predictions == y)
305
306
def load_mnist_sample(n_samples=1000, seed=42):
    """Generate a synthetic MNIST-shaped dataset (for testing).

    NOTE: despite the name this does NOT load real MNIST; it draws
    Gaussian noise and adds a weak class-dependent pattern so the data
    is learnable.

    Args:
        n_samples: number of samples to generate.
        seed: RNG seed; the default (42) reproduces the original samples.

    Returns:
        (X, y): X of shape (n_samples, 784), integer labels y in [0, 10).
    """
    # Use a local RandomState instead of np.random.seed so the caller's
    # global RNG state is left untouched; RandomState(42) yields the
    # exact same legacy sample sequence as the previous global-seed code.
    rng = np.random.RandomState(seed)

    n_classes = 10
    n_features = 784  # 28x28

    X = rng.randn(n_samples, n_features) * 0.5
    y = rng.randint(0, n_classes, n_samples)

    # Faint per-class pattern: class i gets a +1 bump on its own
    # 78-feature slice (covers features 0..779; the last 4 stay noise).
    for i in range(n_classes):
        mask = y == i
        X[mask, i * 78:(i + 1) * 78] += 1.0

    return X, y
324
325
def main():
    """Script entry point: build, train, and evaluate the demo MLP."""
    banner = "=" * 60
    print(banner)
    print("Multi-Layer Perceptron - NumPy From-Scratch ๊ตฌํ˜„")
    print(banner)

    # 1. Build the synthetic dataset.
    print("\n1. ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ")
    X_train, y_train = load_mnist_sample(n_samples=1000)
    X_test, y_test = load_mnist_sample(n_samples=200)
    print(f"   Train: {X_train.shape}, Test: {X_test.shape}")

    # 2. Create the model.
    print("\n2. MLP ๋ชจ๋ธ ์ดˆ๊ธฐํ™”")
    model = MLPNumpy(
        layer_dims=[784, 128, 64, 10],
        activations=['relu', 'relu', 'none'],
    )
    print(f"   Layers: {[layer.W.shape for layer in model.layers]}")

    # 3. Train.
    print("\n3. ํ•™์Šต ์‹œ์ž‘")
    losses = model.fit(
        X_train,
        y_train,
        epochs=50,
        lr=0.1,
        batch_size=32,
        verbose=True,
    )

    # 4. Evaluate on both splits.
    print("\n4. ํ‰๊ฐ€ ๊ฒฐ๊ณผ")
    for split_name, features, labels in (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    ):
        print(f"   {split_name} Accuracy: {model.evaluate(features, labels):.4f}")

    # 5. Visualization -- optional, skipped when matplotlib is absent.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n(matplotlib ์—†์Œ, ์‹œ๊ฐํ™” ์ƒ๋žต)")
        return

    fig, (ax_loss, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

    # Loss curve
    ax_loss.plot(losses)
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_title('Training Loss')
    ax_loss.grid(True)

    # Weight distribution of the first layer
    ax_hist.hist(model.layers[0].W.flatten(), bins=50, alpha=0.7)
    ax_hist.set_xlabel('Weight Value')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('First Layer Weight Distribution')
    ax_hist.grid(True)

    plt.tight_layout()
    plt.savefig('mlp_result.png', dpi=150)
    plt.show()
    print("\n๊ฒฐ๊ณผ ์ด๋ฏธ์ง€ ์ €์žฅ: mlp_result.png")


if __name__ == "__main__":
    main()