# 14_text_classification.py
  1"""
  214. ์‹ค์ „ ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ํ”„๋กœ์ ํŠธ
  3
  4๊ฐ์„ฑ ๋ถ„์„์„ ์œ„ํ•œ ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ํŒŒ์ดํ”„๋ผ์ธ ๊ตฌํ˜„
  5"""
  6
  7import torch
  8import torch.nn as nn
  9import torch.nn.functional as F
 10from torch.utils.data import DataLoader, Dataset
 11import numpy as np
 12import matplotlib.pyplot as plt
 13from collections import Counter
 14import re
 15import math
 16
 17print("=" * 60)
 18print("์‹ค์ „ ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ํ”„๋กœ์ ํŠธ")
 19print("=" * 60)
 20
 21device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 22print(f"Device: {device}")
 23
 24
 25# ============================================
 26# 1. ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ
 27# ============================================
 28print("\n[1] ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ")
 29print("-" * 40)
 30
 31def simple_tokenizer(text):
 32    """๊ฐ„๋‹จํ•œ ํ† ํฌ๋‚˜์ด์ €"""
 33    text = text.lower()
 34    text = re.sub(r'[^\w\s]', '', text)
 35    return text.split()
 36
# Quick smoke test of the tokenizer.
sample = "This is a SAMPLE sentence! With punctuation."
tokens = simple_tokenizer(sample)
print(f"์›๋ฌธ: {sample}")
print(f"ํ† ํฐ: {tokens}")


# ============================================
# 2. Vocabulary construction
# ============================================
print("\n[2] ์–ดํœ˜ ๊ตฌ์ถ•")
print("-" * 40)
 49
 50class Vocabulary:
 51    """ํ…์ŠคํŠธ ์–ดํœ˜ ์‚ฌ์ „"""
 52    def __init__(self, min_freq=2):
 53        self.word2idx = {'<pad>': 0, '<unk>': 1}
 54        self.idx2word = {0: '<pad>', 1: '<unk>'}
 55        self.word_freq = Counter()
 56        self.min_freq = min_freq
 57
 58    def build(self, texts, tokenizer):
 59        """์–ดํœ˜ ๊ตฌ์ถ•"""
 60        for text in texts:
 61            tokens = tokenizer(text)
 62            self.word_freq.update(tokens)
 63
 64        idx = len(self.word2idx)
 65        for word, freq in self.word_freq.items():
 66            if freq >= self.min_freq and word not in self.word2idx:
 67                self.word2idx[word] = idx
 68                self.idx2word[idx] = word
 69                idx += 1
 70
 71        print(f"์ด ๋‹จ์–ด ์ˆ˜: {len(self.word_freq)}")
 72        print(f"์–ดํœ˜ ํฌ๊ธฐ (min_freq={self.min_freq}): {len(self.word2idx)}")
 73
 74    def encode(self, text, tokenizer, max_len=None):
 75        """ํ…์ŠคํŠธ๋ฅผ ์ธ๋ฑ์Šค๋กœ ๋ณ€ํ™˜"""
 76        tokens = tokenizer(text)
 77        indices = [self.word2idx.get(t, self.word2idx['<unk>']) for t in tokens]
 78        if max_len:
 79            if len(indices) > max_len:
 80                indices = indices[:max_len]
 81            else:
 82                indices = indices + [self.word2idx['<pad>']] * (max_len - len(indices))
 83        return indices
 84
 85    def __len__(self):
 86        return len(self.word2idx)
 87
 88
# ============================================
# 3. Sample dataset
# ============================================
print("\n[3] ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ")
print("-" * 40)

# Synthetic positive-sentiment samples for the demo classifier.
positive_samples = [
    "This movie is absolutely amazing and wonderful",
    "I love this product it is fantastic",
    "Great experience highly recommended",
    "Excellent quality and fast delivery",
    "Best purchase I have ever made",
    "Wonderful service and friendly staff",
    "I am very happy with this item",
    "Perfect product exactly what I needed",
    "Amazing value for the price",
    "Outstanding performance and quality",
    "This is the best thing ever",
    "Incredible movie I loved every minute",
    "Superb quality and great design",
    "Highly satisfied with my purchase",
    "Fantastic product works perfectly",
] * 50  # 15 unique sentences x 50 = 750 samples
113
# Synthetic negative-sentiment samples (mirrors the positive list).
negative_samples = [
    "Terrible product do not buy",
    "Worst experience of my life",
    "Very disappointed with the quality",
    "Complete waste of money",
    "Poor customer service",
    "The product broke after one day",
    "I hate this movie it was boring",
    "Never buying from here again",
    "Awful quality and slow delivery",
    "Extremely bad experience",
    "This is the worst product ever",
    "Horrible movie total waste of time",
    "Very poor quality disappointed",
    "Bad product not recommended",
    "Terrible service will not return",
] * 50  # 15 unique sentences x 50 = 750 samples
131
# Merge the two classes and attach labels (1 = positive, 0 = negative).
texts = positive_samples + negative_samples
labels = [1] * len(positive_samples) + [0] * len(negative_samples)

# Shuffle texts and labels with one shared permutation so pairs stay aligned.
perm = np.random.permutation(len(texts))
texts, labels = [texts[i] for i in perm], [labels[i] for i in perm]

print(f"์ด ์ƒ˜ํ”Œ ์ˆ˜: {len(texts)}")
print(f"๊ธ์ •: {sum(labels)}, ๋ถ€์ •: {len(labels) - sum(labels)}")

# Build the vocabulary over the full corpus.
vocab = Vocabulary(min_freq=2)
vocab.build(texts, simple_tokenizer)


# ============================================
# 4. PyTorch Dataset
# ============================================
print("\n[4] PyTorch Dataset")
print("-" * 40)
153
class TextDataset(Dataset):
    """Dataset over raw texts/labels; encodes each text lazily on access."""

    def __init__(self, texts, labels, vocab, tokenizer, max_len=50):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encode to a fixed-length index list, then wrap as long tensors.
        ids = self.vocab.encode(self.texts[idx], self.tokenizer, self.max_len)
        x = torch.tensor(ids, dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y
170
171# ๋ฐ์ดํ„ฐ ๋ถ„ํ• 
172train_size = int(0.8 * len(texts))
173train_texts, test_texts = texts[:train_size], texts[train_size:]
174train_labels, test_labels = labels[:train_size], labels[train_size:]
175
176train_dataset = TextDataset(train_texts, train_labels, vocab, simple_tokenizer)
177test_dataset = TextDataset(test_texts, test_labels, vocab, simple_tokenizer)
178
179train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
180test_loader = DataLoader(test_dataset, batch_size=32)
181
182print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")
183
184# ์ƒ˜ํ”Œ ํ™•์ธ
185sample_x, sample_y = train_dataset[0]
186print(f"์ƒ˜ํ”Œ ์ž…๋ ฅ shape: {sample_x.shape}")
187print(f"์ƒ˜ํ”Œ ๋ผ๋ฒจ: {sample_y.item()}")
188
189
190# ============================================
191# 5. ๊ธฐ๋ณธ ํ…์ŠคํŠธ ๋ถ„๋ฅ˜๊ธฐ (์ž„๋ฒ ๋”ฉ + ํ‰๊ท )
192# ============================================
193print("\n[5] ๊ธฐ๋ณธ ํ…์ŠคํŠธ ๋ถ„๋ฅ˜๊ธฐ")
194print("-" * 40)
195
class SimpleClassifier(nn.Module):
    """Bag-of-embeddings classifier: mean-pool token embeddings, then linear."""

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token indices; index 0 is padding.
        emb = self.embedding(x)                   # (batch, seq, embed)
        valid = x.ne(0).unsqueeze(-1).float()     # 1 for real tokens, 0 for pads
        summed = (emb * valid).sum(dim=1)
        counts = valid.sum(dim=1).clamp(min=1)    # guard all-pad rows from /0
        return self.fc(summed / counts)
210
simple_model = SimpleClassifier(len(vocab), embed_dim=64, num_classes=2)
simple_params = sum(p.numel() for p in simple_model.parameters())
print(f"SimpleClassifier ํŒŒ๋ผ๋ฏธํ„ฐ: {simple_params:,}")


# ============================================
# 6. LSTM classifier
# ============================================
print("\n[6] LSTM ๋ถ„๋ฅ˜๊ธฐ")
print("-" * 40)
220
class LSTMClassifier(nn.Module):
    """Bidirectional multi-layer LSTM classifier.

    The top layer's final forward and backward hidden states are
    concatenated and fed through a small dropout/ReLU MLP head.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 num_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Inter-layer LSTM dropout is only meaningful when num_layers > 1.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=lstm_dropout)

        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        emb = self.embedding(x)
        _, (h_n, _) = self.lstm(emb)

        # h_n: (num_layers * 2, batch, hidden); the last two entries are the
        # top layer's final forward and backward states.
        top_states = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.fc(top_states)
254
lstm_model = LSTMClassifier(len(vocab), embed_dim=64, hidden_dim=128, num_classes=2)
lstm_params = sum(p.numel() for p in lstm_model.parameters())
print(f"LSTMClassifier ํŒŒ๋ผ๋ฏธํ„ฐ: {lstm_params:,}")


# ============================================
# 7. Transformer classifier
# ============================================
print("\n[7] Transformer ๋ถ„๋ฅ˜๊ธฐ")
print("-" * 40)
264
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings, then applies dropout."""

    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        positions = torch.arange(0, max_len).unsqueeze(1).float()
        freqs = torch.exp(torch.arange(0, d_model, 2).float()
                          * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(positions * freqs)  # even dims: sine
        pe[:, 1::2] = torch.cos(positions * freqs)  # odd dims: cosine

        # Buffer, not a parameter: follows .to(device) but takes no gradients.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the first seq_len encodings.
        return self.dropout(x + self.pe[:, :x.size(1)])
283
284
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier with masked mean pooling.

    Pipeline: embed (scaled by sqrt(d_model)) -> positional encoding ->
    TransformerEncoder (padding positions masked out of attention) ->
    mean over non-pad tokens -> dropout + linear head.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers,
                 num_classes, max_len=512, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_encoder = PositionalEncoding(embed_dim, max_len, dropout)

        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers)

        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(embed_dim, num_classes),
        )

    def forward(self, x):
        # True where x is the padding index; those positions are ignored
        # by self-attention via src_key_padding_mask.
        pad = x.eq(0)

        h = self.embedding(x) * math.sqrt(self.embed_dim)
        h = self.pos_encoder(h)
        h = self.transformer(h, src_key_padding_mask=pad)

        # Mean over real tokens only (clamp guards all-pad rows).
        keep = pad.logical_not().unsqueeze(-1).float()
        pooled = (h * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)

        return self.fc(pooled)
324
transformer_model = TransformerClassifier(
    len(vocab), embed_dim=64, num_heads=4, num_layers=2, num_classes=2
)
transformer_params = sum(p.numel() for p in transformer_model.parameters())
print(f"TransformerClassifier ํŒŒ๋ผ๋ฏธํ„ฐ: {transformer_params:,}")


# ============================================
# 8. Training pipeline
# ============================================
print("\n[8] ํ•™์Šต ํŒŒ์ดํ”„๋ผ์ธ")
print("-" * 40)
336
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one training pass over `loader`; return (mean batch loss, accuracy %)."""
    model.train()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()

        # Clip gradient norm — important for RNN stability.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        running_loss += loss.item()
        n_correct += (logits.argmax(dim=1) == batch_y).sum().item()
        n_seen += batch_y.size(0)

    return running_loss / len(loader), 100. * n_correct / n_seen
362
363
def evaluate(model, loader, criterion, device):
    """Evaluate `model` without gradients; return (mean batch loss, accuracy %)."""
    model.eval()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)

            running_loss += criterion(logits, batch_y).item()
            n_correct += (logits.argmax(dim=1) == batch_y).sum().item()
            n_seen += batch_y.size(0)

    return running_loss / len(loader), 100. * n_correct / n_seen
382
383
def train_model(model, train_loader, test_loader, epochs=10, lr=1e-3):
    """Train `model` with Adam + cross-entropy; return per-epoch metric history."""
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history = {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}

    for epoch in range(epochs):
        tr_loss, tr_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        te_loss, te_acc = evaluate(model, test_loader, criterion, device)

        for key, value in (('train_loss', tr_loss), ('train_acc', tr_acc),
                           ('test_loss', te_loss), ('test_acc', te_acc)):
            history[key].append(value)

        # Log the first epoch and every fifth one after that.
        if epoch == 0 or (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d}: Train Loss={tr_loss:.4f}, Acc={tr_acc:.1f}% | "
                  f"Test Loss={te_loss:.4f}, Acc={te_acc:.1f}%")

    return history
405
406
# ============================================
# 9. Model comparison
# ============================================
print("\n[9] ๋ชจ๋ธ ๋น„๊ต ํ•™์Šต")
print("-" * 40)

# Fresh (untrained) instances so every model starts from scratch.
models = {
    'Simple': SimpleClassifier(len(vocab), embed_dim=64, num_classes=2),
    'LSTM': LSTMClassifier(len(vocab), embed_dim=64, hidden_dim=128, num_classes=2),
    'Transformer': TransformerClassifier(len(vocab), embed_dim=64, num_heads=4,
                                          num_layers=2, num_classes=2)
}

results = {}
for name, model in models.items():
    print(f"\n--- {name} ํ•™์Šต ---")
    results[name] = train_model(model, train_loader, test_loader, epochs=15)
    print(f"{name} ์ตœ์ข… ํ…Œ์ŠคํŠธ ์ •ํ™•๋„: {results[name]['test_acc'][-1]:.1f}%")
427
428
# ============================================
# 10. Result visualization
# ============================================
print("\n[10] ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”")
print("-" * 40)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Test-accuracy curves per model.
for name, history in results.items():
    ax_acc.plot(history['test_acc'], label=f"{name} (final={history['test_acc'][-1]:.1f}%)")
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Test Accuracy (%)')
ax_acc.set_title('Model Comparison - Accuracy')
ax_acc.legend()
ax_acc.grid(True, alpha=0.3)

# Test-loss curves per model.
for name, history in results.items():
    ax_loss.plot(history['test_loss'], label=name)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Test Loss')
ax_loss.set_title('Model Comparison - Loss')
ax_loss.legend()
ax_loss.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('text_classification_comparison.png', dpi=100)
plt.close()
print("๊ทธ๋ž˜ํ”„ ์ €์žฅ: text_classification_comparison.png")


# ============================================
# 11. Inference
# ============================================
print("\n[11] ์ถ”๋ก  ํ…Œ์ŠคํŠธ")
print("-" * 40)
466
def predict_sentiment(model, text, vocab, tokenizer, device):
    """Classify `text`; return ('Positive'|'Negative', confidence in [0, 1])."""
    model.eval()
    ids = vocab.encode(text, tokenizer, max_len=50)
    batch = torch.tensor(ids).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        probs = F.softmax(logits, dim=1)
        pred = logits.argmax(dim=1).item()

    label = 'Positive' if pred == 1 else 'Negative'
    return label, probs[0, pred].item()
482
# Sentences for a quick qualitative check (mix of clear and ambiguous cases).
test_sentences = [
    "This product is amazing and I love it",
    "Terrible quality waste of money",
    "It's okay nothing special",
    "Best purchase ever highly recommended",
    "Very disappointed will not buy again",
]

# Predict with the trained LSTM model.
lstm_model = models['LSTM']
print("\nLSTM ๋ชจ๋ธ ์˜ˆ์ธก:")
for sentence in test_sentences:
    sentiment, conf = predict_sentiment(lstm_model, sentence, vocab, simple_tokenizer, device)
    print(f"  [{sentiment:8s}] ({conf*100:5.1f}%) {sentence}")


# ============================================
# 12. Attention visualization
# ============================================
print("\n[12] Attention ๊ฐ€์ค‘์น˜ ๋ถ„์„")
print("-" * 40)
505
class AttentionLSTM(nn.Module):
    """Bidirectional LSTM with additive attention pooling over time steps."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)  # per-step relevance score
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, return_attention=False):
        states, _ = self.lstm(self.embedding(x))  # (batch, seq, 2*hidden)

        # Normalize per-step scores into weights summing to 1 over the sequence,
        # then pool states into a single context vector.
        scores = self.attention(states).squeeze(-1)   # (batch, seq)
        weights = torch.softmax(scores, dim=1)
        context = (states * weights.unsqueeze(-1)).sum(dim=1)

        logits = self.fc(context)
        return (logits, weights) if return_attention else logits
528
attn_model = AttentionLSTM(len(vocab), embed_dim=64, hidden_dim=128, num_classes=2)
attn_params = sum(p.numel() for p in attn_model.parameters())
print(f"AttentionLSTM ํŒŒ๋ผ๋ฏธํ„ฐ: {attn_params:,}")

# Train the attention model.
print("\nAttentionLSTM ํ•™์Šต:")
attn_model = attn_model.to(device)
history = train_model(attn_model, train_loader, test_loader, epochs=10)
536
537
538# Attention ์‹œ๊ฐํ™”
def visualize_attention(model, text, vocab, tokenizer, device):
    """Return (tokens, attention weights, sentiment label, confidence) for `text`."""
    model.eval()
    tokens = tokenizer(text)
    # Encode at exactly the token count so weights align 1:1 with tokens.
    ids = vocab.encode(text, tokenizer, max_len=len(tokens))
    batch = torch.tensor(ids).unsqueeze(0).to(device)

    with torch.no_grad():
        logits, weights = model(batch, return_attention=True)
        pred = logits.argmax(dim=1).item()
        confidence = F.softmax(logits, dim=1)[0, pred].item()

    weights = weights[0].cpu().numpy()[:len(tokens)]
    label = 'Positive' if pred == 1 else 'Negative'

    return tokens, weights, label, confidence
554
555# ์‹œ๊ฐํ™”
556sample_text = "This movie is absolutely amazing and wonderful"
557tokens, attn, sentiment, prob = visualize_attention(attn_model, sample_text, vocab, simple_tokenizer, device)
558
559print(f"\n๋ฌธ์žฅ: {sample_text}")
560print(f"์˜ˆ์ธก: {sentiment} ({prob*100:.1f}%)")
561print("\nAttention ๊ฐ€์ค‘์น˜:")
562for token, weight in zip(tokens, attn):
563    bar = 'โ–ˆ' * int(weight * 50)
564    print(f"  {token:12s} {weight:.3f} {bar}")
565
566
# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ์ •๋ฆฌ")
print("=" * 60)

# Fix: the pipeline arrows were mojibake ('โ†’'); restored '→' (U+2192).
summary = """
ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ํŒŒ์ดํ”„๋ผ์ธ:
    1. ํ† ํฐํ™”: ํ…์ŠคํŠธ → ๋‹จ์–ด ๋ฆฌ์ŠคํŠธ
    2. ์–ดํœ˜ ๊ตฌ์ถ•: ๋‹จ์–ด → ์ธ๋ฑ์Šค ๋งคํ•‘
    3. ์ธ์ฝ”๋”ฉ: ํ…์ŠคํŠธ → ํ…์„œ
    4. ๋ชจ๋ธ: ์ž„๋ฒ ๋”ฉ → ์ธ์ฝ”๋” → ๋ถ„๋ฅ˜

๋ชจ๋ธ ๋น„๊ต:
    - Simple (์ž„๋ฒ ๋”ฉ ํ‰๊ท ): ๋น ๋ฆ„, ๊ฐ„๋‹จ
    - LSTM: ์ˆœ์„œ ์ •๋ณด ํ™œ์šฉ, ์•ˆ์ •์ 
    - Transformer: ๋ณ‘๋ ฌํ™”, ๊ธด ์‹œํ€€์Šค

ํ•ต์‹ฌ ์ฝ”๋“œ:
    # ์–ดํœ˜ ๊ตฌ์ถ•
    vocab = Vocabulary(min_freq=2)
    vocab.build(texts, tokenizer)

    # LSTM ๋ถ„๋ฅ˜๊ธฐ
    lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True)
    _, (h_n, _) = lstm(embedded)
    combined = torch.cat([h_n[-2], h_n[-1]], dim=1)

    # Transformer ๋ถ„๋ฅ˜๊ธฐ
    encoder = nn.TransformerEncoder(encoder_layer, num_layers)
    output = encoder(embedded, src_key_padding_mask=padding_mask)

ํ•™์Šต ํŒ:
    - ๊ธฐ์šธ๊ธฐ ํด๋ฆฌํ•‘ (RNN์— ํ•„์ˆ˜)
    - Dropout (๊ณผ์ ํ•ฉ ๋ฐฉ์ง€)
    - ์ ์ ˆํ•œ ํŒจ๋”ฉ ์ฒ˜๋ฆฌ

๋‹ค์Œ ๋‹จ๊ณ„:
    - HuggingFace Transformers
    - BERT/GPT ํŒŒ์ธํŠœ๋‹
    - ๋Œ€๊ทœ๋ชจ ๋ฐ์ดํ„ฐ์…‹ (IMDb)
"""
print(summary)
print("=" * 60)