40. Practical Text Classification Project
Previous: Practical Image Classification Project | Next: Model Saving and Deployment
40. Practical Text Classification Project¶
Learning Objectives¶
- Text preprocessing and tokenization
- Using embedding layers
- LSTM/Transformer-based classifiers
- Sentiment analysis project
1. Text Preprocessing¶
Tokenization¶
from torchtext.data.utils import get_tokenizer

# 'basic_english' lowercases and splits on words/punctuation.
tokenizer = get_tokenizer('basic_english')

text = "This is a sample sentence!"
tokens = tokenizer(text)
# -> ['this', 'is', 'a', 'sample', 'sentence', '!']
Building Vocabulary¶
from torchtext.vocab import build_vocab_from_iterator


def yield_tokens(data_iter):
    """Yield the token list of each (text, label) pair in the iterator."""
    yield from (tokenizer(text) for text, _ in data_iter)


# Keep only tokens seen at least 5 times; reserve <unk>/<pad> specials.
vocab = build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=['<unk>', '<pad>'],
    min_freq=5,
)
# Any out-of-vocabulary token maps to <unk>.
vocab.set_default_index(vocab['<unk>'])
Text → Tensor¶
def text_pipeline(text):
    """Map raw text to a list of vocabulary indices."""
    return [vocab[tok] for tok in tokenizer(text)]


def collate_fn(batch):
    """Encode and right-pad a batch of (text, label) pairs into tensors."""
    texts, labels = zip(*batch)
    # Tokenize, then pad every sequence to the longest one in the batch.
    encoded = [torch.tensor(text_pipeline(t)) for t in texts]
    padded = nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    return padded, torch.tensor(labels)
2. Embedding Layer¶
Basic Embedding¶
class TextClassifier(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, mean-pool, linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimensionality.
        num_classes: number of output classes (logits).
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Index 0 is the padding token; its embedding row stays all-zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len) of token indices, 0 = padding.
        embedded = self.embedding(x)  # (batch, seq, embed)
        # Masked mean pooling: average only over real (non-pad) tokens.
        # A plain .mean(dim=1) divides by the padded length, so the logits
        # would depend on how much padding the batch happens to contain.
        mask = (x != 0).unsqueeze(-1).to(embedded.dtype)  # (batch, seq, 1)
        lengths = mask.sum(dim=1).clamp(min=1.0)          # avoid 0-division
        pooled = (embedded * mask).sum(dim=1) / lengths
        return self.fc(pooled)
Pre-trained Embeddings (GloVe)¶
from torchtext.vocab import GloVe

# 100-dimensional GloVe vectors trained on the 6B-token corpus.
glove = GloVe(name='6B', dim=100)

# Build an embedding matrix aligned with our vocabulary; words missing
# from GloVe keep a zero vector.
embedding_matrix = torch.zeros(len(vocab), 100)
for idx, word in enumerate(vocab.get_itos()):
    if word in glove.stoi:
        embedding_matrix[idx] = glove[word]

# Install the pre-trained weights in the model's embedding layer.
model.embedding.weight = nn.Parameter(embedding_matrix)
model.embedding.weight.requires_grad = False  # freeze (set True to fine-tune)
3. LSTM Classifier¶
class LSTMClassifier(nn.Module):
    """(Bi)LSTM sentence classifier over padded token-index sequences."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 num_layers=2, bidirectional=True, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # PyTorch only applies inter-layer dropout when num_layers > 1.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
        )
        feat_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(feat_dim, feat_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(feat_dim // 2, num_classes),
        )

    def forward(self, x):
        # x: (batch, seq) of token indices.
        _, (hidden, _) = self.lstm(self.embedding(x))
        if self.lstm.bidirectional:
            # Concatenate the last forward and last backward hidden states.
            final = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            final = hidden[-1]
        return self.fc(final)
4. Transformer Classifier¶
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier with learned token embeddings.

    Args:
        vocab_size: embedding table size.
        embed_dim: model width (d_model).
        num_heads: attention heads per encoder layer.
        num_layers: number of encoder layers.
        num_classes: number of output classes.
        max_len: maximum sequence length for positional encoding.
        dropout: dropout probability.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers,
                 num_classes, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_encoder = PositionalEncoding(embed_dim, max_len, dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(embed_dim, num_classes)
        )

    def forward(self, x, mask=None):
        # Padding mask: True where the token is the pad index (0).
        padding_mask = (x == 0)
        # Scale embeddings by sqrt(d_model), as in "Attention Is All You Need".
        embedded = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        embedded = self.pos_encoder(embedded)
        output = self.transformer(embedded, src_key_padding_mask=padding_mask)
        # Masked mean pooling: average only over real tokens. A plain
        # output.mean(dim=1) would also average the padding positions
        # (ignored by attention but still present in the output tensor),
        # diluting the sequence representation for short sequences.
        keep = (~padding_mask).unsqueeze(-1).to(output.dtype)  # (batch, seq, 1)
        lengths = keep.sum(dim=1).clamp(min=1.0)               # avoid 0-division
        pooled = (output * keep).sum(dim=1) / lengths
        return self.fc(pooled)
5. Sentiment Analysis Dataset¶
IMDb¶
from torchtext.datasets import IMDB

# Load the IMDb movie-review sentiment dataset.
train_data, test_data = IMDB(split=('train', 'test'))

# Labels: 'pos' -> 1, 'neg' -> 0
def label_pipeline(label):
    """Convert a string sentiment label to an integer class id (pos=1)."""
    return int(label == 'pos')
Data Loader¶
def collate_batch(batch):
    """Encode, pad, and truncate a batch of (label, text) pairs."""
    labels, texts = [], []
    for label, text in batch:
        labels.append(label_pipeline(label))
        texts.append(torch.tensor(text_pipeline(text), dtype=torch.long))
    label_tensor = torch.tensor(labels)
    # Right-pad with index 0 (matches the models' padding_idx).
    text_tensor = nn.utils.rnn.pad_sequence(
        texts, batch_first=True, padding_value=0
    )
    # Cap sequence length to keep memory bounded.
    if text_tensor.size(1) > 256:
        text_tensor = text_tensor[:, :256]
    return text_tensor, label_tensor


train_loader = DataLoader(train_data, batch_size=32, shuffle=True,
                          collate_fn=collate_batch)
6. Training Pipeline¶
def train_text_classifier():
    """Train an LSTM sentiment classifier, printing per-epoch loss/accuracy."""
    # Model
    model = LSTMClassifier(
        vocab_size=len(vocab),
        embed_dim=128,
        hidden_dim=256,
        num_classes=2,
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            output = model(texts)
            loss = criterion(output, labels)
            loss.backward()
            # Gradient clipping — important for RNN training stability.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()
            correct += (output.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

        train_acc = 100. * correct / total
        print(f"Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, "
              f"Acc={train_acc:.2f}%")
7. Inference¶
def predict_sentiment(model, text, vocab, tokenizer):
    """Classify a single review; return (label string, confidence in [0, 1])."""
    model.eval()
    token_ids = [vocab[t] for t in tokenizer(text.lower())]
    batch = torch.tensor(token_ids).unsqueeze(0).to(device)  # (1, seq)
    with torch.no_grad():
        logits = model(batch)
    prob = F.softmax(logits, dim=1)
    pred = logits.argmax(dim=1).item()
    sentiment = 'Positive' if pred == 1 else 'Negative'
    confidence = prob[0, pred].item()
    return sentiment, confidence


# Usage
text = "This movie was absolutely amazing! I loved every minute of it."
sentiment, conf = predict_sentiment(model, text, vocab, tokenizer)
print(f"{sentiment} ({conf*100:.1f}%)")
8. Using Hugging Face¶
BERT Classifier¶
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Pre-trained tokenizer and model with a fresh 2-class classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2
)


def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating to 256 tokens."""
    return tokenizer(examples['text'], padding='max_length',
                     truncation=True, max_length=256)


# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
Summary¶
Text Classification Checklist¶
- [ ] Tokenization and vocabulary building
- [ ] Padding handling
- [ ] Embeddings (trained or pre-trained)
- [ ] Model selection (LSTM/Transformer)
- [ ] Gradient clipping
- [ ] Evaluation and inference
Model Selection Guide¶
| Model | Advantages | Disadvantages |
|---|---|---|
| LSTM | Simple implementation, fast training | Difficult with long sequences |
| Transformer | Parallelization, long sequences | High memory requirements |
| BERT (transfer learning) | Best performance | Slow, heavy |
Expected Accuracy (IMDb)¶
| Model | Accuracy |
|---|---|
| LSTM | 85-88% |
| Transformer | 87-90% |
| BERT (fine-tuned) | 93-95% |
Conclusion¶
This completes the Deep Learning learning course!
Learning Summary¶
- Basics (01-04): Tensors, neural networks, backpropagation, training techniques
- CNN (05-07): Convolution, ResNet, transfer learning
- Sequences (08-10): RNN, LSTM, Transformer
- Practical (11-14): Optimization, deployment, projects
Recommended Next Steps¶
- Study large language models in the LLM_and_NLP folder
- Apply to real projects
- Participate in Kaggle competitions