01. NLP 기초

ํ•™์Šต ๋ชฉํ‘œ

  • ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ ๊ธฐ๋ฒ•
  • ํ† ํฐํ™” ๋ฐฉ๋ฒ• ์ดํ•ด
  • ์–ดํœ˜ ๊ตฌ์ถ•๊ณผ ์ธ์ฝ”๋”ฉ
  • ํ…์ŠคํŠธ ์ •๊ทœํ™”

1. ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ

์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ

์›๋ณธ ํ…์ŠคํŠธ
    โ†“
์ •๊ทœํ™” (์†Œ๋ฌธ์ž, ํŠน์ˆ˜๋ฌธ์ž ์ œ๊ฑฐ)
    โ†“
ํ† ํฐํ™” (๋‹จ์–ด/์„œ๋ธŒ์›Œ๋“œ ๋ถ„๋ฆฌ)
    โ†“
๋ถˆ์šฉ์–ด ์ œ๊ฑฐ (์„ ํƒ)
    โ†“
์–ดํœ˜ ๊ตฌ์ถ•
    โ†“
์ธ์ฝ”๋”ฉ (ํ…์ŠคํŠธ โ†’ ์ˆซ์ž)

๊ธฐ๋ณธ ์ „์ฒ˜๋ฆฌ

import re

def preprocess(text):
    """Normalize raw text: lowercase, strip punctuation, collapse whitespace."""
    lowered = text.lower()
    # Drop everything that is not a word character or whitespace.
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    # Collapse whitespace runs into single spaces and trim both ends.
    return re.sub(r'\s+', ' ', no_punct).strip()

# Demo: punctuation removed, whitespace collapsed, text lowercased.
text = "Hello, World! This is NLP   processing."
print(preprocess(text))
# "hello world this is nlp processing"

2. ํ† ํฐํ™” (Tokenization)

๋‹จ์–ด ํ† ํฐํ™”

# Whitespace-based tokenization
text = "I love natural language processing"
tokens = text.split()
# ['I', 'love', 'natural', 'language', 'processing']

# NLTK word tokenizer (splits punctuation and contractions)
import nltk
from nltk.tokenize import word_tokenize
tokens = word_tokenize("I don't like it.")
# ['I', 'do', "n't", 'like', 'it', '.']

์„œ๋ธŒ์›Œ๋“œ ํ† ํฐํ™”

์„œ๋ธŒ์›Œ๋“œ๋Š” ๋‹จ์–ด๋ฅผ ๋” ์ž‘์€ ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ

"unhappiness" โ†’ ["un", "##happiness"] (WordPiece)
"unhappiness" โ†’ ["un", "happi", "ness"] (BPE)

์žฅ์ : - ๋ฏธ๋“ฑ๋ก ๋‹จ์–ด(OOV) ์ฒ˜๋ฆฌ ๊ฐ€๋Šฅ - ์–ดํœ˜ ํฌ๊ธฐ ์ถ•์†Œ - ํ˜•ํƒœ์†Œ ์ •๋ณด ๋ณด์กด

BPE (Byte Pair Encoding)

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Create a BPE tokenizer; unknown tokens map to [UNK]
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Learn the merge rules from a text corpus file
trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)

# Tokenize
output = tokenizer.encode("Hello, world!")
print(output.tokens)

WordPiece (BERT)

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "I love natural language processing"
tokens = tokenizer.tokenize(text)
# ['i', 'love', 'natural', 'language', 'processing']

# Encode: text -> token IDs (adds [CLS]/[SEP] automatically)
encoded = tokenizer.encode(text)
# [101, 1045, 2293, 3019, 2653, 6364, 102]

# Decode: token IDs -> text
decoded = tokenizer.decode(encoded)
# "[CLS] i love natural language processing [SEP]"

SentencePiece (GPT, T5)

import sentencepiece as spm

# Train a BPE model on the corpus file
spm.SentencePieceTrainer.train(
    input='corpus.txt',
    model_prefix='spm',
    vocab_size=8000,
    model_type='bpe'
)

# Load the trained model and use it
sp = spm.SentencePieceProcessor()
sp.load('spm.model')

tokens = sp.encode_as_pieces("Hello, world!")
# ['▁Hello', ',', '▁world', '!']  (▁ marks a word boundary)

ids = sp.encode_as_ids("Hello, world!")
# [1234, 567, 890, 12]  (actual IDs depend on the trained vocabulary)

3. ์–ดํœ˜ ๊ตฌ์ถ• (Vocabulary)

๊ธฐ๋ณธ ์–ดํœ˜ ์‚ฌ์ „

from collections import Counter

class Vocabulary:
    """Word-level vocabulary mapping tokens to integer ids.

    Ids 0-3 are reserved for the special tokens <pad>, <unk>, <bos>, <eos>;
    regular words get ids in first-seen corpus order once their frequency
    reaches `min_freq`.
    """

    def __init__(self, min_freq=1):
        specials = ['<pad>', '<unk>', '<bos>', '<eos>']
        self.word2idx = {tok: i for i, tok in enumerate(specials)}
        self.idx2word = {i: tok for i, tok in enumerate(specials)}
        self.word_freq = Counter()
        self.min_freq = min_freq

    def build(self, texts, tokenizer):
        """Count token frequencies over `texts`, then register frequent words."""
        for sentence in texts:
            self.word_freq.update(tokenizer(sentence))

        # Assign the next free id to each sufficiently frequent new word.
        next_id = len(self.word2idx)
        for word, count in self.word_freq.items():
            if count < self.min_freq or word in self.word2idx:
                continue
            self.word2idx[word] = next_id
            self.idx2word[next_id] = word
            next_id += 1

    def encode(self, text, tokenizer):
        """Map `text` to a list of ids; out-of-vocabulary tokens become <unk>."""
        unk = self.word2idx['<unk>']
        return [self.word2idx.get(tok, unk) for tok in tokenizer(text)]

    def decode(self, indices):
        """Map ids back to token strings; unknown ids become '<unk>'."""
        return [self.idx2word.get(i, '<unk>') for i in indices]

    def __len__(self):
        return len(self.word2idx)

# Usage — NOTE(review): `texts` is assumed to be an iterable of strings
# defined elsewhere in the lesson; confirm before running.
vocab = Vocabulary(min_freq=2)
vocab.build(texts, str.split)
encoded = vocab.encode("hello world", str.split)

torchtext ์–ดํœ˜

from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter, tokenizer):
    """Yield the token list for each text in `data_iter`."""
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(
    yield_tokens(texts, tokenizer),
    specials=['<pad>', '<unk>'],
    min_freq=2
)
vocab.set_default_index(vocab['<unk>'])  # unseen tokens map to <unk>

# Usage
indices = vocab(tokenizer("hello world"))

4. ํŒจ๋”ฉ๊ณผ ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ

์‹œํ€€์Šค ํŒจ๋”ฉ

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate (text, label) pairs into a padded id tensor and a label tensor.

    NOTE(review): relies on module-level `vocab`, `tokenizer`, and `max_len`
    being defined elsewhere — confirm before reuse.
    """
    texts, labels = zip(*batch)

    # Tokenize and encode each text into an id tensor
    encoded = [torch.tensor(vocab.encode(t, tokenizer)) for t in texts]

    # Pad to the longest sequence in the batch (pad id 0)
    padded = pad_sequence(encoded, batch_first=True, padding_value=0)

    # Truncate to the maximum allowed length
    if padded.size(1) > max_len:
        padded = padded[:, :max_len]

    labels = torch.tensor(labels)
    return padded, labels

# Attach to a DataLoader (`dataset` assumed defined earlier)
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

Attention Mask

def create_attention_mask(input_ids, pad_token_id=0):
    """Return a 0/1 mask: 1 at real-token positions, 0 at padding positions."""
    is_real_token = input_ids.ne(pad_token_id)
    return is_real_token.long()

# Example
input_ids = torch.tensor([[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]])
attention_mask = create_attention_mask(input_ids)
# tensor([[1, 1, 1, 0, 0], [1, 1, 0, 0, 0]])

5. ํ…์ŠคํŠธ ์ •๊ทœํ™”

๋‹ค์–‘ํ•œ ์ •๊ทœํ™” ๊ธฐ๋ฒ•

import unicodedata

def normalize_text(text):
    """Apply a set of common text-normalization steps and return the result.

    Steps: Unicode NFC composition, lowercasing, URL and e-mail removal,
    digit-run replacement with the <NUM> placeholder, and shrinking any
    character repeated 3+ times down to exactly two occurrences.
    """
    # Compose combining characters into canonical (NFC) form, then lowercase.
    cleaned = unicodedata.normalize('NFC', text).lower()

    # Strip URLs, then e-mail addresses.
    cleaned = re.sub(r'http\S+', '', cleaned)
    cleaned = re.sub(r'\S+@\S+', '', cleaned)

    # Replace every run of digits with a single placeholder token (optional step).
    cleaned = re.sub(r'\d+', '<NUM>', cleaned)

    # Squeeze 3-or-more repeats of one character to two ("sooooo" -> "soo").
    cleaned = re.sub(r'(.)\1{2,}', r'\1\1', cleaned)

    return cleaned.strip()

๋ถˆ์šฉ์–ด ์ œ๊ฑฐ

import nltk
from nltk.corpus import stopwords

# Download the English stopword list on first use
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    # Drop tokens whose lowercase form is in the module-level stopword set.
    return [t for t in tokens if t.lower() not in stop_words]

tokens = ['this', 'is', 'a', 'test', 'sentence']
filtered = remove_stopwords(tokens)
# ['test', 'sentence']

ํ‘œ์ œ์–ด ์ถ”์ถœ (Lemmatization)

from nltk.stem import WordNetLemmatizer
import nltk

# WordNet data is required by the lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Default POS is noun, so verb forms like 'running'/'ran' are left unchanged
words = ['running', 'runs', 'ran', 'better', 'cats']
lemmas = [lemmatizer.lemmatize(w) for w in words]
# ['running', 'run', 'ran', 'better', 'cat']

6. HuggingFace ํ† ํฌ๋‚˜์ด์ €

๊ธฐ๋ณธ ์‚ฌ์šฉ

from transformers import AutoTokenizer

# Load the tokenizer matching the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode a single sentence to fixed-length tensors
text = "Hello, how are you?"
encoded = tokenizer(
    text,
    padding='max_length',
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

print(encoded['input_ids'].shape)      # torch.Size([1, 32])
print(encoded['attention_mask'].shape) # torch.Size([1, 32])

๋ฐฐ์น˜ ์ธ์ฝ”๋”ฉ

texts = ["Hello world", "NLP is fun", "I love Python"]

# padding=True pads to the longest sequence in the batch (not to max_length)
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors='pt'
)

print(encoded['input_ids'].shape)  # torch.Size([3, L]), L = batch-longest length, capped at 16

ํŠน์ˆ˜ ํ† ํฐ

# BERT special tokens
print(tokenizer.special_tokens_map)
# {'unk_token': '[UNK]', 'sep_token': '[SEP]',
#  'pad_token': '[PAD]', 'cls_token': '[CLS]',
#  'mask_token': '[MASK]'}

# Token IDs
print(tokenizer.cls_token_id)  # 101
print(tokenizer.sep_token_id)  # 102
print(tokenizer.pad_token_id)  # 0

7. ์‹ค์Šต: ํ…์ŠคํŠธ ๋ถ„๋ฅ˜ ์ „์ฒ˜๋ฆฌ

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class TextClassificationDataset(Dataset):
    """Dataset wrapping raw texts and integer labels for a HF-style tokenizer.

    Each item is a dict with fixed-length `input_ids` and `attention_mask`
    tensors plus a scalar `label` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize one example, truncating/padding to exactly max_length.
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns shape (1, max_length); drop the batch axis.
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx])
        }

# Usage — `texts` and `labels` assumed defined earlier in the lesson.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = TextClassificationDataset(texts, labels, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch in loader:
    input_ids = batch['input_ids']       # (32, 128)
    attention_mask = batch['attention_mask']  # (32, 128)
    labels = batch['label']              # (32,)
    break

์ •๋ฆฌ

ํ† ํฐํ™” ๋ฐฉ๋ฒ• ๋น„๊ต

๋ฐฉ๋ฒ• ์žฅ์  ๋‹จ์  ์‚ฌ์šฉ ๋ชจ๋ธ
๋‹จ์–ด ๋‹จ์œ„ ์ง๊ด€์  OOV ๋ฌธ์ œ ์ „ํ†ต NLP
BPE OOV ํ•ด๊ฒฐ ํ•™์Šต ํ•„์š” GPT
WordPiece OOV ํ•ด๊ฒฐ ํ•™์Šต ํ•„์š” BERT
SentencePiece ์–ธ์–ด ๋ฌด๊ด€ ํ•™์Šต ํ•„์š” T5, GPT

ํ•ต์‹ฌ ์ฝ”๋“œ

# HuggingFace tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

# Vocabulary construction
vocab = build_vocab_from_iterator(yield_tokens(texts), specials=['<pad>', '<unk>'])

# Padding
padded = pad_sequence(sequences, batch_first=True, padding_value=0)

๋‹ค์Œ ๋‹จ๊ณ„

02_Word2Vec_GloVe.md์—์„œ ๋‹จ์–ด ์ž„๋ฒ ๋”ฉ์„ ํ•™์Šตํ•ฉ๋‹ˆ๋‹ค.

Use the lesson links above to navigate between lessons.