# mlp_numpy.py
"""
Multi-Layer Perceptron -- a from-scratch NumPy implementation.

This file implements an MLP in pure NumPy.  The backpropagation
algorithm is implemented by hand to illustrate the core mechanics
of deep learning.

Learning goals:
1. Forward pass: forward propagation through a multi-layer network
2. Backward pass: backpropagation via the chain rule
3. Activation functions: ReLU, Sigmoid, Tanh
4. Weight initialization: Xavier and He initialization
"""
 14
 15import numpy as np
 16
 17
 18class ActivationFunctions:
 19    """ํ™œ์„ฑํ™” ํ•จ์ˆ˜์™€ ๊ทธ ๋ฏธ๋ถ„"""
 20
 21    @staticmethod
 22    def relu(z):
 23        return np.maximum(0, z)
 24
 25    @staticmethod
 26    def relu_derivative(z):
 27        return (z > 0).astype(float)
 28
 29    @staticmethod
 30    def sigmoid(z):
 31        # ์ˆ˜์น˜ ์•ˆ์ •์„ฑ์„ ์œ„ํ•ด ํด๋ฆฌํ•‘
 32        z = np.clip(z, -500, 500)
 33        return 1 / (1 + np.exp(-z))
 34
 35    @staticmethod
 36    def sigmoid_derivative(z):
 37        s = ActivationFunctions.sigmoid(z)
 38        return s * (1 - s)
 39
 40    @staticmethod
 41    def tanh(z):
 42        return np.tanh(z)
 43
 44    @staticmethod
 45    def tanh_derivative(z):
 46        return 1 - np.tanh(z) ** 2
 47
 48    @staticmethod
 49    def softmax(z):
 50        # ์ˆ˜์น˜ ์•ˆ์ •์„ฑ: ์ตœ๋Œ€๊ฐ’์„ ๋นผ์คŒ
 51        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
 52        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
 53
 54
 55class Layer:
 56    """
 57    ๋‹จ์ผ Fully Connected Layer
 58
 59    z = Wx + b (์„ ํ˜• ๋ณ€ํ™˜)
 60    a = ฯƒ(z)   (ํ™œ์„ฑํ™”)
 61    """
 62
 63    def __init__(self, input_dim: int, output_dim: int, activation: str = 'relu'):
 64        """
 65        Args:
 66            input_dim: ์ž…๋ ฅ ์ฐจ์›
 67            output_dim: ์ถœ๋ ฅ ์ฐจ์›
 68            activation: 'relu', 'sigmoid', 'tanh', 'none'
 69        """
 70        # He ์ดˆ๊ธฐํ™” (ReLU์šฉ)
 71        if activation == 'relu':
 72            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(2.0 / input_dim)
 73        else:
 74            # Xavier ์ดˆ๊ธฐํ™”
 75            self.W = np.random.randn(input_dim, output_dim) * np.sqrt(1.0 / input_dim)
 76
 77        self.b = np.zeros((1, output_dim))
 78
 79        self.activation = activation
 80        self._get_activation_fn()
 81
 82        # Gradients
 83        self.dW = None
 84        self.db = None
 85
 86        # Cache (for backward)
 87        self.cache = {}
 88
 89    def _get_activation_fn(self):
 90        """ํ™œ์„ฑํ™” ํ•จ์ˆ˜ ์„ค์ •"""
 91        activations = {
 92            'relu': (ActivationFunctions.relu, ActivationFunctions.relu_derivative),
 93            'sigmoid': (ActivationFunctions.sigmoid, ActivationFunctions.sigmoid_derivative),
 94            'tanh': (ActivationFunctions.tanh, ActivationFunctions.tanh_derivative),
 95            'none': (lambda x: x, lambda x: np.ones_like(x)),
 96        }
 97        self.act_fn, self.act_derivative = activations[self.activation]
 98
 99    def forward(self, x: np.ndarray) -> np.ndarray:
100        """
101        Forward pass
102
103        Args:
104            x: ์ž…๋ ฅ (batch_size, input_dim)
105
106        Returns:
107            a: ํ™œ์„ฑํ™” ์ถœ๋ ฅ (batch_size, output_dim)
108        """
109        # ์บ์‹œ ์ €์žฅ (backward์—์„œ ์‚ฌ์šฉ)
110        self.cache['x'] = x
111
112        # ์„ ํ˜• ๋ณ€ํ™˜: z = Wx + b
113        z = np.dot(x, self.W) + self.b
114        self.cache['z'] = z
115
116        # ํ™œ์„ฑํ™”: a = ฯƒ(z)
117        a = self.act_fn(z)
118        self.cache['a'] = a
119
120        return a
121
122    def backward(self, da: np.ndarray) -> np.ndarray:
123        """
124        Backward pass
125
126        Args:
127            da: ์ถœ๋ ฅ์˜ gradient (batch_size, output_dim)
128
129        Returns:
130            dx: ์ž…๋ ฅ์˜ gradient (batch_size, input_dim)
131        """
132        x = self.cache['x']
133        z = self.cache['z']
134        batch_size = x.shape[0]
135
136        # โˆ‚L/โˆ‚z = โˆ‚L/โˆ‚a ร— โˆ‚a/โˆ‚z = da ร— ฯƒ'(z)
137        dz = da * self.act_derivative(z)
138
139        # โˆ‚L/โˆ‚W = x^T ร— โˆ‚L/โˆ‚z
140        self.dW = np.dot(x.T, dz) / batch_size
141
142        # โˆ‚L/โˆ‚b = sum(โˆ‚L/โˆ‚z)
143        self.db = np.sum(dz, axis=0, keepdims=True) / batch_size
144
145        # โˆ‚L/โˆ‚x = โˆ‚L/โˆ‚z ร— W^T (๋‹ค์Œ ๋ ˆ์ด์–ด๋กœ ์ „ํŒŒ)
146        dx = np.dot(dz, self.W.T)
147
148        return dx
149
150
class MLPNumpy:
    """
    Multi-Layer Perceptron (pure-NumPy implementation).

    Example:
        model = MLPNumpy([784, 256, 128, 10], activations=['relu', 'relu', 'none'])
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
    """

    def __init__(self, layer_dims: list, activations: list = None):
        """
        Args:
            layer_dims: dimensions of each layer [input, hidden1, ..., output]
            activations: activation name per layer; defaults to ReLU for
                every hidden layer and 'none' (linear logits) for the output.
        """
        self.layers = []
        n_layers = len(layer_dims) - 1

        if activations is None:
            activations = ['relu'] * (n_layers - 1) + ['none']

        for i in range(n_layers):
            layer = Layer(layer_dims[i], layer_dims[i + 1], activations[i])
            self.layers.append(layer)

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass through the whole network; returns the raw logits."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, loss_grad: np.ndarray) -> None:
        """Backward pass through the whole network (fills each layer's dW/db)."""
        grad = loss_grad
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    @staticmethod
    def _to_onehot(labels: np.ndarray, n_classes: int) -> np.ndarray:
        """Convert sparse integer labels (n,) to a one-hot matrix (n, n_classes)."""
        onehot = np.zeros((len(labels), n_classes))
        onehot[np.arange(len(labels)), labels] = 1
        return onehot

    def compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Cross-entropy loss (for classification).

        L = -1/n * sum(y_true * log(y_pred))

        Args:
            y_true: sparse int labels (n,) or one-hot matrix (n, n_classes)
            y_pred: predicted probabilities (n, n_classes)
        """
        eps = 1e-15  # keep log() away from 0
        y_pred = np.clip(y_pred, eps, 1 - eps)

        if y_true.ndim == 1:
            # Sparse labels -> one-hot
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        loss = -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
        return loss

    def compute_loss_gradient(self, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        """
        Cross-entropy gradient, assuming y_pred came through softmax.

        dL/dz = y_pred - y_true (the softmax + CE simplification)
        """
        if y_true.ndim == 1:
            y_true = self._to_onehot(y_true, y_pred.shape[1])

        return y_pred - y_true

    def update_weights(self, lr: float) -> None:
        """Vanilla SGD step using each layer's cached gradients."""
        for layer in self.layers:
            layer.W -= lr * layer.dW
            layer.b -= lr * layer.db

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        epochs: int = 100,
        lr: float = 0.01,
        batch_size: int = 32,
        verbose: bool = True
    ) -> list:
        """
        Train the model with mini-batch SGD.

        Args:
            X: training data (n_samples, n_features)
            y: labels, sparse (n_samples,) or one-hot (n_samples, n_classes)
            epochs: number of epochs
            lr: learning rate
            batch_size: mini-batch size
            verbose: print progress roughly 10 times over the run

        Returns:
            losses: per-epoch mean loss
        """
        n_samples = X.shape[0]
        losses = []
        # Bug fix: `epochs // 10` is 0 when epochs < 10, which made the
        # modulo below raise ZeroDivisionError.  Clamp to at least 1.
        log_every = max(1, epochs // 10)

        for epoch in range(epochs):
            # Shuffle once per epoch.  Fancy indexing works the same for
            # sparse (1-D) and one-hot (2-D) labels.
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            epoch_loss = 0.0

            # Mini-batch training
            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]

                # Forward produces logits; softmax turns them into
                # probabilities (the last layer is expected to be 'none').
                y_pred = ActivationFunctions.softmax(self.forward(X_batch))

                # Loss, weighted by actual batch size (last batch may be short)
                loss = self.compute_loss(y_batch, y_pred)
                epoch_loss += loss * len(X_batch)

                # Backward + SGD update
                self.backward(self.compute_loss_gradient(y_batch, y_pred))
                self.update_weights(lr)

            epoch_loss /= n_samples
            losses.append(epoch_loss)

            if verbose and (epoch + 1) % log_every == 0:
                accuracy = self.evaluate(X, y)
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}")

        return losses

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class index for each sample."""
        logits = self.forward(X)
        probs = ActivationFunctions.softmax(logits)
        return np.argmax(probs, axis=1)

    def evaluate(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return classification accuracy; accepts sparse or one-hot labels."""
        predictions = self.predict(X)
        if y.ndim > 1:
            y = np.argmax(y, axis=1)
        return np.mean(predictions == y)
305
306
def load_mnist_sample(n_samples=1000, seed=42):
    """Generate a synthetic MNIST-shaped dataset (for testing).

    NOTE: despite the name this does NOT load real MNIST; it draws
    Gaussian noise and adds a weak class-dependent pattern so the data
    is learnable.

    Args:
        n_samples: number of samples to generate.
        seed: RNG seed; the default (42) reproduces the original samples.

    Returns:
        (X, y): X of shape (n_samples, 784), integer labels y in [0, 10).
    """
    # Use a local RandomState instead of np.random.seed so the caller's
    # global RNG state is left untouched; RandomState(42) yields the
    # exact same legacy sample sequence as the previous global-seed code.
    rng = np.random.RandomState(seed)

    n_classes = 10
    n_features = 784  # 28x28

    X = rng.randn(n_samples, n_features) * 0.5
    y = rng.randint(0, n_classes, n_samples)

    # Faint per-class pattern: class i gets a +1 bump on its own
    # 78-feature slice (covers features 0..779; the last 4 stay noise).
    for i in range(n_classes):
        mask = y == i
        X[mask, i * 78:(i + 1) * 78] += 1.0

    return X, y
324
325
def main():
    """Script entry point: build, train, and evaluate the demo MLP."""
    banner = "=" * 60
    print(banner)
    print("Multi-Layer Perceptron - NumPy From-Scratch ๊ตฌํ˜„")
    print(banner)

    # 1. Build the synthetic dataset.
    print("\n1. ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ")
    X_train, y_train = load_mnist_sample(n_samples=1000)
    X_test, y_test = load_mnist_sample(n_samples=200)
    print(f"   Train: {X_train.shape}, Test: {X_test.shape}")

    # 2. Create the model.
    print("\n2. MLP ๋ชจ๋ธ ์ดˆ๊ธฐํ™”")
    model = MLPNumpy(
        layer_dims=[784, 128, 64, 10],
        activations=['relu', 'relu', 'none'],
    )
    print(f"   Layers: {[layer.W.shape for layer in model.layers]}")

    # 3. Train.
    print("\n3. ํ•™์Šต ์‹œ์ž‘")
    losses = model.fit(
        X_train,
        y_train,
        epochs=50,
        lr=0.1,
        batch_size=32,
        verbose=True,
    )

    # 4. Evaluate on both splits.
    print("\n4. ํ‰๊ฐ€ ๊ฒฐ๊ณผ")
    for split_name, features, labels in (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    ):
        print(f"   {split_name} Accuracy: {model.evaluate(features, labels):.4f}")

    # 5. Visualization -- optional, skipped when matplotlib is absent.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("\n(matplotlib ์—†์Œ, ์‹œ๊ฐํ™” ์ƒ๋žต)")
        return

    fig, (ax_loss, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

    # Loss curve
    ax_loss.plot(losses)
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_title('Training Loss')
    ax_loss.grid(True)

    # Weight distribution of the first layer
    ax_hist.hist(model.layers[0].W.flatten(), bins=50, alpha=0.7)
    ax_hist.set_xlabel('Weight Value')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('First Layer Weight Distribution')
    ax_hist.grid(True)

    plt.tight_layout()
    plt.savefig('mlp_result.png', dpi=150)
    plt.show()
    print("\n๊ฒฐ๊ณผ ์ด๋ฏธ์ง€ ์ €์žฅ: mlp_result.png")


if __name__ == "__main__":
    main()