01. NLP 기초
01. NLP 기초¶
학습 목표¶
- 텍스트 전처리 기법
- 토큰화 방법 이해
- 어휘 구축과 인코딩
- 텍스트 정규화
1. 텍스트 전처리¶
전처리 파이프라인¶
원본 텍스트
↓
정규화 (소문자, 특수문자 제거)
↓
토큰화 (단어/서브워드 분리)
↓
불용어 제거 (선택)
↓
어휘 구축
↓
인코딩 (텍스트 → 숫자)
기본 전처리¶
import re

def preprocess(text):
    """Clean raw text: lowercase, strip punctuation, collapse whitespace."""
    cleaned = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    # Collapse whitespace runs into single spaces and trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

text = "Hello, World! This is NLP processing."
print(preprocess(text))
# "hello world this is nlp processing"
2. 토큰화 (Tokenization)¶
단어 토큰화¶
# Whitespace-based tokenization
text = "I love natural language processing"
tokens = text.split()
# ['I', 'love', 'natural', 'language', 'processing']
# NLTK word tokenizer (also splits contractions and punctuation)
import nltk
from nltk.tokenize import word_tokenize
tokens = word_tokenize("I don't like it.")
# ['I', 'do', "n't", 'like', 'it', '.']
서브워드 토큰화¶
서브워드는 단어를 더 작은 단위로 분리
"unhappiness" → ["un", "##happiness"] (WordPiece)
"unhappiness" → ["un", "happi", "ness"] (BPE)
장점: - 미등록 단어(OOV) 처리 가능 - 어휘 크기 축소 - 형태소 정보 보존
BPE (Byte Pair Encoding)¶
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Build a BPE tokenizer with an explicit unknown-token marker.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train on the raw corpus file, reserving the usual special tokens.
bpe_trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"])
tokenizer.train(files=["corpus.txt"], trainer=bpe_trainer)

# Tokenize a sample sentence with the freshly trained model.
output = tokenizer.encode("Hello, world!")
print(output.tokens)
WordPiece (BERT)¶
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "I love natural language processing"

# Split into WordPiece tokens (lowercased by the uncased model).
tokens = tokenizer.tokenize(text)
# ['i', 'love', 'natural', 'language', 'processing']

# Encode: text -> token ids, with [CLS]/[SEP] added automatically.
encoded = tokenizer.encode(text)
# [101, 1045, 2293, 3019, 2653, 6364, 102]

# Decode: ids -> text, special tokens rendered back in.
decoded = tokenizer.decode(encoded)
# "[CLS] i love natural language processing [SEP]"
SentencePiece (GPT, T5)¶
import sentencepiece as spm

# Train a BPE SentencePiece model from a raw corpus file.
spm.SentencePieceTrainer.train(
    input='corpus.txt',
    model_prefix='spm',
    vocab_size=8000,
    model_type='bpe',
)

# Load the trained model and tokenize with it.
sp = spm.SentencePieceProcessor()
sp.load('spm.model')

tokens = sp.encode_as_pieces("Hello, world!")
# ['▁Hello', ',', '▁world', '!']  (▁ marks a preceding space)
ids = sp.encode_as_ids("Hello, world!")
# [1234, 567, 890, 12]
3. 어휘 구축 (Vocabulary)¶
기본 어휘 사전¶
from collections import Counter
class Vocabulary:
    """Word-level vocabulary with reserved special tokens and frequency filtering."""

    def __init__(self, min_freq=1):
        # The four special tokens always occupy indices 0-3, in this order.
        specials = ['<pad>', '<unk>', '<bos>', '<eos>']
        self.word2idx = {tok: i for i, tok in enumerate(specials)}
        self.idx2word = {i: tok for i, tok in enumerate(specials)}
        self.word_freq = Counter()
        self.min_freq = min_freq

    def build(self, texts, tokenizer):
        """Count token frequencies over texts, then register frequent words."""
        for text in texts:
            self.word_freq.update(tokenizer(text))
        # Add words meeting the threshold, preserving first-seen order.
        for word, freq in self.word_freq.items():
            if freq >= self.min_freq and word not in self.word2idx:
                new_idx = len(self.word2idx)
                self.word2idx[word] = new_idx
                self.idx2word[new_idx] = word

    def encode(self, text, tokenizer):
        """Map text to a list of indices; unknown words map to <unk>."""
        unk = self.word2idx['<unk>']
        return [self.word2idx.get(tok, unk) for tok in tokenizer(text)]

    def decode(self, indices):
        """Map indices back to tokens; unknown indices become '<unk>'."""
        return [self.idx2word.get(i, '<unk>') for i in indices]

    def __len__(self):
        return len(self.word2idx)
# Usage example (assumes `texts` — an iterable of strings — is defined elsewhere)
vocab = Vocabulary(min_freq=2)
vocab.build(texts, str.split)
encoded = vocab.encode("hello world", str.split)
torchtext 어휘¶
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter, tokenizer):
    """Lazily produce the token list for each text in data_iter."""
    return (tokenizer(sample) for sample in data_iter)
# Build a torchtext vocab from the token iterator; min_freq filters rare words.
# (Assumes `texts` and `tokenizer` are defined elsewhere.)
vocab = build_vocab_from_iterator(
    yield_tokens(texts, tokenizer),
    specials=['<pad>', '<unk>'],
    min_freq=2
)
# Unknown words map to <unk> instead of raising KeyError.
vocab.set_default_index(vocab['<unk>'])
# Usage
indices = vocab(tokenizer("hello world"))
4. 패딩과 배치 처리¶
시퀀스 패딩¶
import torch
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Tokenize, encode, and pad a batch of (text, label) pairs.

    NOTE(review): relies on module-level `vocab`, `tokenizer`, and `max_len`
    being defined elsewhere — confirm before reuse.
    """
    texts, labels = zip(*batch)
    # Encode each text into a 1-D tensor of vocabulary indices.
    sequences = [torch.tensor(vocab.encode(text, tokenizer)) for text in texts]
    # Pad to the longest sequence in the batch (pad id 0 == <pad>).
    batch_tensor = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Enforce the global maximum length by truncation.
    if batch_tensor.size(1) > max_len:
        batch_tensor = batch_tensor[:, :max_len]
    return batch_tensor, torch.tensor(labels)
# Plug the collate function into a DataLoader (assumes `dataset` is defined elsewhere)
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)
Attention Mask¶
def create_attention_mask(input_ids, pad_token_id=0):
    """Return a mask: 1 at real-token positions, 0 at padding positions."""
    is_real_token = input_ids != pad_token_id
    return is_real_token.long()

# Example
input_ids = torch.tensor([[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]])
attention_mask = create_attention_mask(input_ids)
# tensor([[1, 1, 1, 0, 0], [1, 1, 0, 0, 0]])
5. 텍스트 정규화¶
다양한 정규화 기법¶
import unicodedata
def normalize_text(text):
    """Apply a fixed pipeline of common text normalizations and return the result."""
    # Unicode canonical composition (NFC), then case folding.
    text = unicodedata.normalize('NFC', text).lower()
    # Ordered regex substitutions; order matters (e.g. digits are replaced
    # after lowercasing, so the '<NUM>' placeholder keeps its case).
    substitutions = [
        (r'http\S+', ''),         # strip URLs
        (r'\S+@\S+', ''),         # strip email addresses
        (r'\d+', '<NUM>'),        # digit runs -> placeholder (optional step)
        (r'(.)\1{2,}', r'\1\1'),  # squash 3+ repeats: "sooooo" -> "soo"
    ]
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
불용어 제거¶
import nltk
from nltk.corpus import stopwords

# Fetch the stopword list once; set membership tests are O(1).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Drop tokens whose lowercase form is an English stopword."""
    return [token for token in tokens if token.lower() not in stop_words]

tokens = ['this', 'is', 'a', 'test', 'sentence']
filtered = remove_stopwords(tokens)
# ['test', 'sentence']
표제어 추출 (Lemmatization)¶
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
words = ['running', 'runs', 'ran', 'better', 'cats']
# Presumably only noun inflections change because the default POS is noun
# ('running'/'ran'/'better' stay as-is) — verify against the NLTK docs.
lemmas = [lemmatizer.lemmatize(word) for word in words]
# ['running', 'run', 'ran', 'better', 'cat']
6. HuggingFace 토크나이저¶
기본 사용¶
from transformers import AutoTokenizer

# Load the tokenizer matching a pretrained checkpoint by name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode one sentence, padded AND truncated to a fixed length of 32.
text = "Hello, how are you?"
encoded = tokenizer(
    text,
    padding='max_length',
    truncation=True,
    max_length=32,
    return_tensors='pt',
)
print(encoded['input_ids'].shape)       # torch.Size([1, 32])
print(encoded['attention_mask'].shape)  # torch.Size([1, 32])
배치 인코딩¶
# Batch encoding of several sentences at once.
texts = ["Hello world", "NLP is fun", "I love Python"]
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors='pt',
)
# padding=True pads only to the longest item in the batch, so the second
# dimension is the longest sequence length (at most 16 due to truncation).
print(encoded['input_ids'].shape)  # torch.Size([3, L]) with L <= 16
특수 토큰¶
# BERT special tokens (assumes `tokenizer` from the previous snippet)
print(tokenizer.special_tokens_map)
# {'unk_token': '[UNK]', 'sep_token': '[SEP]',
#  'pad_token': '[PAD]', 'cls_token': '[CLS]',
#  'mask_token': '[MASK]'}
# Integer IDs of the special tokens
print(tokenizer.cls_token_id)  # 101
print(tokenizer.sep_token_id)  # 102
print(tokenizer.pad_token_id)  # 0
7. 실습: 텍스트 분류 전처리¶
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes raw texts on the fly for text classification.

    Args:
        texts: sequence of input strings.
        labels: sequence of per-example labels, same length as `texts`.
        tokenizer: HuggingFace-style callable returning 'input_ids' and
            'attention_mask' tensors of shape (1, max_length).
        max_length: fixed length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' (shape
        (max_length,)) and 'label' (0-dim tensor) for sample `idx`."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops ONLY the batch dimension; a bare .squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label)
        }
# Usage (assumes `texts` and `labels` are defined elsewhere)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = TextClassificationDataset(texts, labels, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    input_ids = batch['input_ids']            # (32, 128)
    attention_mask = batch['attention_mask']  # (32, 128)
    labels = batch['label']                   # (32,)
    break
정리¶
토큰화 방법 비교¶
| 방법 | 장점 | 단점 | 사용 모델 |
|---|---|---|---|
| 단어 단위 | 직관적 | OOV 문제 | 전통 NLP |
| BPE | OOV 해결 | 학습 필요 | GPT |
| WordPiece | OOV 해결 | 학습 필요 | BERT |
| SentencePiece | 언어 무관 | 학습 필요 | T5, GPT |
핵심 코드¶
# HuggingFace tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
# Vocabulary building (torchtext)
# NOTE(review): `yield_tokens` defined above takes (data_iter, tokenizer) —
# this summary call passes only one argument; confirm intended signature.
vocab = build_vocab_from_iterator(yield_tokens(texts), specials=['<pad>', '<unk>'])
# Padding to the longest sequence
padded = pad_sequence(sequences, batch_first=True, padding_value=0)
다음 단계¶
02_Word2Vec_GloVe.md에서 단어 임베딩을 학습합니다.