# 09_lstm_gru.py — PyTorch LSTM/GRU tutorial script
# (download-page residue: "Download / python 395 lines 11.3 KB")
  1"""
  209. LSTM๊ณผ GRU
  3
  4LSTM๊ณผ GRU์˜ ๊ตฌํ˜„๊ณผ ํ™œ์šฉ์„ ํ•™์Šตํ•ฉ๋‹ˆ๋‹ค.
  5"""
  6
  7import torch
  8import torch.nn as nn
  9import torch.nn.functional as F
 10import numpy as np
 11import matplotlib.pyplot as plt
 12
 13print("=" * 60)
 14print("PyTorch LSTM/GRU")
 15print("=" * 60)
 16
 17device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 18
 19
 20# ============================================
 21# 1. LSTM ๊ธฐ๋ณธ
 22# ============================================
 23print("\n[1] LSTM ๊ธฐ๋ณธ")
 24print("-" * 40)
 25
 26lstm = nn.LSTM(
 27    input_size=10,
 28    hidden_size=20,
 29    num_layers=2,
 30    batch_first=True,
 31    dropout=0.1
 32)
 33
 34# ์ž…๋ ฅ
 35x = torch.randn(4, 8, 10)  # (batch, seq, features)
 36
 37# ์ˆœ์ „ํŒŒ
 38output, (h_n, c_n) = lstm(x)
 39
 40print(f"์ž…๋ ฅ: {x.shape}")
 41print(f"output: {output.shape}")  # (4, 8, 20)
 42print(f"h_n (์€๋‹‰): {h_n.shape}")  # (2, 4, 20)
 43print(f"c_n (์…€): {c_n.shape}")    # (2, 4, 20)
 44
 45# ์ดˆ๊ธฐ ์ƒํƒœ ์ง€์ •
 46h0 = torch.zeros(2, 4, 20)
 47c0 = torch.zeros(2, 4, 20)
 48output, (h_n, c_n) = lstm(x, (h0, c0))
 49print(f"\n์ดˆ๊ธฐ ์ƒํƒœ ์ง€์ •: h0={h0.shape}, c0={c0.shape}")
 50
 51
 52# ============================================
 53# 2. GRU ๊ธฐ๋ณธ
 54# ============================================
 55print("\n[2] GRU ๊ธฐ๋ณธ")
 56print("-" * 40)
 57
 58gru = nn.GRU(
 59    input_size=10,
 60    hidden_size=20,
 61    num_layers=2,
 62    batch_first=True
 63)
 64
 65output, h_n = gru(x)
 66
 67print(f"GRU output: {output.shape}")
 68print(f"GRU h_n: {h_n.shape}")  # ์…€ ์ƒํƒœ ์—†์Œ
 69
 70
 71# ============================================
 72# 3. ์–‘๋ฐฉํ–ฅ LSTM
 73# ============================================
 74print("\n[3] ์–‘๋ฐฉํ–ฅ LSTM")
 75print("-" * 40)
 76
 77lstm_bi = nn.LSTM(
 78    input_size=10,
 79    hidden_size=20,
 80    num_layers=2,
 81    batch_first=True,
 82    bidirectional=True
 83)
 84
 85output_bi, (h_n_bi, c_n_bi) = lstm_bi(x)
 86
 87print(f"์–‘๋ฐฉํ–ฅ LSTM:")
 88print(f"  output: {output_bi.shape}")  # (4, 8, 40)
 89print(f"  h_n: {h_n_bi.shape}")        # (4, 4, 20)
 90
 91# ์ •๋ฐฉํ–ฅ/์—ญ๋ฐฉํ–ฅ ๋ถ„๋ฆฌ
 92forward_out = output_bi[:, :, :20]
 93backward_out = output_bi[:, :, 20:]
 94print(f"  ์ •๋ฐฉํ–ฅ: {forward_out.shape}")
 95print(f"  ์—ญ๋ฐฉํ–ฅ: {backward_out.shape}")
 96
 97
 98# ============================================
 99# 4. LSTM ๋ถ„๋ฅ˜๊ธฐ
100# ============================================
101print("\n[4] LSTM ๋ถ„๋ฅ˜๊ธฐ")
102print("-" * 40)
103
class LSTMClassifier(nn.Module):
    """Sequence classifier: multi-layer (optionally bidirectional) LSTM + linear head."""

    def __init__(self, input_size, hidden_size, num_classes,
                 num_layers=2, bidirectional=True, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1

        # PyTorch warns if dropout is set on a single-layer RNN, so only
        # enable inter-layer dropout when there is more than one layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=inter_layer_dropout,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * self.num_directions, num_classes)

    def forward(self, x):
        """x: (batch, seq, features) -> logits of shape (batch, num_classes)."""
        _, (h_n, _) = self.lstm(x)

        if not self.bidirectional:
            # Final hidden state of the top layer.
            summary = h_n[-1]
        else:
            # h_n is (num_layers * 2, batch, hidden); its last two rows are the
            # top layer's forward and backward final states — concatenate them.
            summary = torch.cat([h_n[-2], h_n[-1]], dim=1)

        return self.fc(self.dropout(summary))
139
# Smoke test on the dummy batch defined above.
model = LSTMClassifier(input_size=10, hidden_size=32, num_classes=5)
out = model(x)
print(f"๋ถ„๋ฅ˜๊ธฐ ์ถœ๋ ฅ: {out.shape}")
143
144
# ============================================
# 5. Time-series prediction comparison (RNN vs LSTM vs GRU)
# ============================================
print("\n[5] RNN vs LSTM vs GRU ๋น„๊ต")
print("-" * 40)

def generate_complex_series(seq_len=100, n_samples=1000):
    """Generate a synthetic next-step-prediction dataset.

    Each sample is two superimposed sines plus a linear trend and Gaussian
    noise. Returns ``X`` of shape (n_samples, seq_len, 1) and ``y`` of shape
    (n_samples,), where ``y`` is the value one step past the input window.
    """
    X, y = [], []
    for _ in range(n_samples):
        t = np.linspace(0, 10 * np.pi, seq_len + 1)
        signal = (np.sin(t) + 0.5 * np.sin(3 * t) + 0.1 * t
                  + 0.1 * np.random.randn(seq_len + 1))
        X.append(signal[:-1].reshape(-1, 1))
        y.append(signal[-1])
    return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

X, y = generate_complex_series(seq_len=100, n_samples=2000)
split = 1600  # 80/20 train/test split
X_train, y_train = torch.from_numpy(X[:split]), torch.from_numpy(y[:split])
X_test, y_test = torch.from_numpy(X[split:]), torch.from_numpy(y[split:])
165
class TimeSeriesModel(nn.Module):
    """Single-layer recurrent regressor; the backbone is picked by ``model_type``."""

    def __init__(self, model_type='lstm', hidden_size=64):
        super().__init__()
        # Dispatch table instead of an if/elif chain.
        backbones = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        if model_type in backbones:
            self.rnn = backbones[model_type](1, hidden_size, batch_first=True)

        self.model_type = model_type
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """x: (batch, seq, 1) -> one scalar prediction per sample, shape (batch,)."""
        if self.model_type == 'lstm':
            _, (h_n, _) = self.rnn(x)  # LSTM also returns a cell state
        else:
            _, h_n = self.rnn(x)
        # Project the final hidden state of the (only) layer to a scalar.
        return self.fc(h_n[-1]).squeeze(-1)
185
def train_model(model_type, epochs=30):
    """Train one TimeSeriesModel variant and return (per-epoch losses, test MSE).

    Relies on the module-level ``X_train``/``y_train``/``X_test``/``y_test``
    tensors and ``device``.
    """
    model = TimeSeriesModel(model_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    losses = []
    for _ in range(epochs):
        model.train()
        running = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            loss = criterion(model(batch_x), batch_y)

            optimizer.zero_grad()
            loss.backward()
            # Clip gradients — recurrent nets are prone to exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            running += loss.item()
        losses.append(running / len(train_loader))

    # Held-out evaluation.
    model.eval()
    with torch.no_grad():
        test_loss = criterion(model(X_test.to(device)), y_test.to(device)).item()

    return losses, test_loss
222
# Train all three backbones and collect their curves.
print("๋ชจ๋ธ ํ•™์Šต ์ค‘...")
results = {}
for model_type in ('rnn', 'lstm', 'gru'):
    curve, mse = train_model(model_type)
    results[model_type] = {'losses': curve, 'test_loss': mse}
    print(f"  {model_type.upper()}: Test MSE = {mse:.6f}")

# Plot the training curves and save the figure to disk.
plt.figure(figsize=(10, 5))
for name, data in results.items():
    plt.plot(data['losses'], label=f"{name.upper()} (test={data['test_loss']:.4f})")
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('RNN vs LSTM vs GRU')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('rnn_lstm_gru_comparison.png', dpi=100)
plt.close()
print("๊ทธ๋ž˜ํ”„ ์ €์žฅ: rnn_lstm_gru_comparison.png")
243
244
# ============================================
# 6. Text classification example
# ============================================
print("\n[6] ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ์˜ˆ์ œ")
print("-" * 40)

class TextClassifier(nn.Module):
    """Token-index classifier: embedding -> 2-layer BiLSTM -> dropout + linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        # padding_idx=0 keeps the pad token's embedding fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                            bidirectional=True, num_layers=2, dropout=0.3)
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_dim * 2, num_classes),
        )

    def forward(self, x):
        """x: (batch, seq) of token indices -> (batch, num_classes) logits."""
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        # Top layer's forward (h_n[-2]) and backward (h_n[-1]) final states.
        combined = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.fc(combined)

model = TextClassifier(vocab_size=10000, embed_dim=128,
                       hidden_dim=256, num_classes=5)
print(f"TextClassifier ํŒŒ๋ผ๋ฏธํ„ฐ: {sum(p.numel() for p in model.parameters()):,}")

# Dummy batch: 8 sentences of 50 tokens each.
x = torch.randint(0, 10000, (8, 50))
out = model(x)
print(f"์ž…๋ ฅ: {x.shape} โ†’ ์ถœ๋ ฅ: {out.shape}")
277
278
# ============================================
# 7. Language model (text generation)
# ============================================
print("\n[7] ์–ธ์–ด ๋ชจ๋ธ")
print("-" * 40)

class CharLSTM(nn.Module):
    """Character-level language model: embedding -> LSTM -> per-step vocab logits."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """x: (batch, seq) token ids -> (logits, hidden); logits is (batch, seq, vocab)."""
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        return self.fc(output), hidden

    def generate(self, start_tokens, max_len=50, temperature=1.0):
        """Extend ``start_tokens`` with up to ``max_len`` sampled tokens.

        Lower temperature sharpens the distribution; higher flattens it.
        The LSTM hidden state is threaded through so each step only needs
        to feed the most recent token.
        """
        self.eval()
        tokens = list(start_tokens)
        hidden = None

        with torch.no_grad():
            for _ in range(max_len):
                last = torch.tensor([[tokens[-1]]])
                logits, hidden = self(last, hidden)

                # Temperature-scaled sampling from the final step's logits.
                probs = F.softmax(logits[0, -1] / temperature, dim=0)
                tokens.append(torch.multinomial(probs, 1).item())

        return tokens

char_lm = CharLSTM(vocab_size=128, embed_dim=64, hidden_dim=256)
print(f"CharLSTM ํŒŒ๋ผ๋ฏธํ„ฐ: {sum(p.numel() for p in char_lm.parameters()):,}")

# Generation smoke test, primed with token ids for "ABC".
generated = char_lm.generate([65, 66, 67], max_len=20)
print(f"์ƒ์„ฑ๋œ ํ† ํฐ: {generated[:10]}...")
322
323
# ============================================
# 8. LSTM internals visualization
# ============================================
print("\n[8] LSTM ๊ฒŒ์ดํŠธ ๋ถ„์„")
print("-" * 40)

class LSTMWithGates(nn.Module):
    """Unrolled LSTMCell that also records per-step gate activations.

    The previous version allocated a ``gates`` dict but never filled it, so
    no gate values were ever produced despite the class name. nn.LSTMCell
    does not expose its gates, so they are recomputed here from the cell's
    own parameters (PyTorch chunk order: input, forget, cell(g), output)
    and stored in ``self.last_gates`` after every forward pass.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm_cell = nn.LSTMCell(input_size, hidden_size)
        # Filled by forward(): {'input'|'forget'|'output': (batch, seq, hidden)}.
        self.last_gates = None

    def forward(self, x):
        """x: (batch, seq, features) -> stacked hidden states (batch, seq, hidden)."""
        batch_size, seq_len, _ = x.shape
        # Create initial states on x's device/dtype (the old code hard-coded
        # CPU float32 tensors, which breaks for CUDA inputs).
        h = x.new_zeros(batch_size, self.hidden_size)
        c = x.new_zeros(batch_size, self.hidden_size)

        cell = self.lstm_cell
        outputs = []
        gates = {'input': [], 'forget': [], 'output': []}

        for t in range(seq_len):
            x_t = x[:, t]
            # Gate pre-activations: W_ih x_t + b_ih + W_hh h + b_hh, split into
            # the four (i, f, g, o) chunks; the sigmoid gates are recorded.
            # no_grad: recording is for analysis only, not for training.
            with torch.no_grad():
                pre = (F.linear(x_t, cell.weight_ih, cell.bias_ih)
                       + F.linear(h, cell.weight_hh, cell.bias_hh))
                i_gate, f_gate, _, o_gate = pre.chunk(4, dim=1)
                gates['input'].append(torch.sigmoid(i_gate))
                gates['forget'].append(torch.sigmoid(f_gate))
                gates['output'].append(torch.sigmoid(o_gate))

            h, c = cell(x_t, (h, c))
            outputs.append(h)

        self.last_gates = {k: torch.stack(v, dim=1) for k, v in gates.items()}
        return torch.stack(outputs, dim=1)

# Smoke test
lstm_gates = LSTMWithGates(10, 20)
x = torch.randn(1, 30, 10)
out = lstm_gates(x)
print(f"๊ฒŒ์ดํŠธ ๋ถ„์„์šฉ LSTM ์ถœ๋ ฅ: {out.shape}")
356
357
# ============================================
# Wrap-up
# ============================================
print("\n" + "=" * 60)
print("LSTM/GRU ์ •๋ฆฌ")
print("=" * 60)

# Cheat-sheet of the patterns shown above (printed verbatim).
summary = """
LSTM:
    output, (h_n, c_n) = lstm(x)
    - ์…€ ์ƒํƒœ(c)๋กœ ์žฅ๊ธฐ ๊ธฐ์–ต ์œ ์ง€
    - Forget, Input, Output ๊ฒŒ์ดํŠธ

GRU:
    output, h_n = gru(x)
    - ์…€ ์ƒํƒœ ์—†์Œ, ๋” ๋‹จ์ˆœ
    - Reset, Update ๊ฒŒ์ดํŠธ

๋ถ„๋ฅ˜ ํŒจํ„ด:
    # ์–‘๋ฐฉํ–ฅ LSTM
    forward_last = h_n[-2]
    backward_last = h_n[-1]
    combined = torch.cat([forward_last, backward_last], dim=1)
    output = fc(combined)

ํ…์ŠคํŠธ ๋ถ„๋ฅ˜:
    embedded = embedding(x)  # ํ† ํฐ โ†’ ๋ฒกํ„ฐ
    _, (h_n, _) = lstm(embedded)
    output = fc(h_n[-1])

์„ ํƒ ๊ธฐ์ค€:
    - ๊ธด ์‹œํ€€์Šค, ๋ณต์žกํ•œ ์˜์กด์„ฑ โ†’ LSTM
    - ๋น ๋ฅธ ํ•™์Šต, ์ œํ•œ๋œ ์ž์› โ†’ GRU
    - ๋‹จ์ˆœํ•œ ํŒจํ„ด โ†’ RNN๋„ ๊ฐ€๋Šฅ
"""
print(summary)
print("=" * 60)