# 01_tokenization.py
  1"""
  201. NLP ๊ธฐ์ดˆ - ํ† ํฐํ™” ์˜ˆ์ œ
  3
  4ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ์™€ ํ† ํฐํ™” ๊ธฐ๋ฒ• ์‹ค์Šต
  5"""
  6
  7import re
  8from collections import Counter
  9
 10print("=" * 60)
 11print("NLP ๊ธฐ์ดˆ: ํ† ํฐํ™”")
 12print("=" * 60)
 13
 14
 15# ============================================
 16# 1. ๊ธฐ๋ณธ ์ „์ฒ˜๋ฆฌ
 17# ============================================
 18print("\n[1] ๊ธฐ๋ณธ ์ „์ฒ˜๋ฆฌ")
 19print("-" * 40)
 20
 21def preprocess(text):
 22    """๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ"""
 23    # ์†Œ๋ฌธ์ž ๋ณ€ํ™˜
 24    text = text.lower()
 25    # ํŠน์ˆ˜๋ฌธ์ž ์ œ๊ฑฐ
 26    text = re.sub(r'[^\w\s]', '', text)
 27    # ๋‹ค์ค‘ ๊ณต๋ฐฑ ์ •๋ฆฌ
 28    text = re.sub(r'\s+', ' ', text).strip()
 29    return text
 30
 31sample = "Hello, World! This is NLP   processing."
 32cleaned = preprocess(sample)
 33print(f"์›๋ณธ: {sample}")
 34print(f"์ „์ฒ˜๋ฆฌ: {cleaned}")
 35
 36
 37# ============================================
 38# 2. ๋‹จ์–ด ํ† ํฐํ™”
 39# ============================================
 40print("\n[2] ๋‹จ์–ด ํ† ํฐํ™”")
 41print("-" * 40)
 42
 43def simple_tokenize(text):
 44    """๊ณต๋ฐฑ ๊ธฐ๋ฐ˜ ํ† ํฐํ™”"""
 45    return text.lower().split()
 46
 47text = "I love natural language processing"
 48tokens = simple_tokenize(text)
 49print(f"ํ…์ŠคํŠธ: {text}")
 50print(f"ํ† ํฐ: {tokens}")
 51
 52# NLTK ํ† ํฐํ™” (์„ค์น˜ ํ•„์š”: pip install nltk)
 53try:
 54    import nltk
 55    nltk.download('punkt', quiet=True)
 56    from nltk.tokenize import word_tokenize
 57
 58    text2 = "I don't like it. It's not good!"
 59    nltk_tokens = word_tokenize(text2)
 60    print(f"\nNLTK ํ† ํฐํ™”: {nltk_tokens}")
 61except ImportError:
 62    print("\nNLTK ๋ฏธ์„ค์น˜ (pip install nltk)")
 63
 64
 65# ============================================
 66# 3. ์–ดํœ˜ ์‚ฌ์ „ ๊ตฌ์ถ•
 67# ============================================
 68print("\n[3] ์–ดํœ˜ ์‚ฌ์ „ ๊ตฌ์ถ•")
 69print("-" * 40)
 70
 71class Vocabulary:
 72    def __init__(self, min_freq=1):
 73        self.word2idx = {'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3}
 74        self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<bos>', 3: '<eos>'}
 75        self.word_freq = Counter()
 76        self.min_freq = min_freq
 77
 78    def build(self, texts):
 79        """ํ…์ŠคํŠธ ๋ฆฌ์ŠคํŠธ๋กœ ์–ดํœ˜ ๊ตฌ์ถ•"""
 80        for text in texts:
 81            tokens = simple_tokenize(text)
 82            self.word_freq.update(tokens)
 83
 84        idx = len(self.word2idx)
 85        for word, freq in self.word_freq.items():
 86            if freq >= self.min_freq and word not in self.word2idx:
 87                self.word2idx[word] = idx
 88                self.idx2word[idx] = word
 89                idx += 1
 90
 91    def encode(self, text):
 92        """ํ…์ŠคํŠธ๋ฅผ ์ธ๋ฑ์Šค๋กœ ๋ณ€ํ™˜"""
 93        tokens = simple_tokenize(text)
 94        return [self.word2idx.get(t, self.word2idx['<unk>']) for t in tokens]
 95
 96    def decode(self, indices):
 97        """์ธ๋ฑ์Šค๋ฅผ ํ† ํฐ์œผ๋กœ ๋ณ€ํ™˜"""
 98        return [self.idx2word.get(i, '<unk>') for i in indices]
 99
100    def __len__(self):
101        return len(self.word2idx)
102
# Build the vocabulary from a small toy corpus.
texts = [
    "I love machine learning",
    "Machine learning is amazing",
    "Deep learning is a subset of machine learning",
    "I love deep learning"
]

vocab = Vocabulary(min_freq=1)
vocab.build(texts)

print(f"어휘 크기: {len(vocab)}")
print(f"상위 빈도 단어: {vocab.word_freq.most_common(5)}")

# Round-trip: encode text to indices, then decode back to tokens.
test_text = "I love learning"
encoded = vocab.encode(test_text)
decoded = vocab.decode(encoded)
print(f"\n원본: {test_text}")
print(f"인코딩: {encoded}")
print(f"디코딩: {decoded}")

# ============================================
# 4. Padding
# ============================================
print("\n[4] 패딩")
print("-" * 40)

def pad_sequences(sequences, max_len=None, pad_value=0):
    """Pad (or truncate) each sequence to a common length.

    Args:
        sequences: List of lists of token ids.
        max_len: Target length; inferred as the longest sequence when None.
        pad_value: Fill value appended to short sequences.

    Returns:
        New list of lists, each exactly max_len long.
    """
    if max_len is None:
        # default=0 keeps an empty input from raising ValueError in max().
        max_len = max((len(seq) for seq in sequences), default=0)

    return [
        seq[:max_len] if len(seq) > max_len
        else seq + [pad_value] * (max_len - len(seq))
        for seq in sequences
    ]

# Demonstrate padding on encoded sequences of different lengths.
sequences = [
    vocab.encode("I love learning"),
    vocab.encode("Machine learning is great"),
    vocab.encode("Deep")
]

print("원본 시퀀스:")
for seq in sequences:
    print(f"  {seq}")

padded = pad_sequences(sequences, max_len=5)
print("\n패딩 후:")
for seq in padded:
    print(f"  {seq}")

# ============================================
# 5. HuggingFace tokenizer (requires install)
# ============================================
print("\n[5] HuggingFace 토크나이저")
print("-" * 40)

try:
    from transformers import AutoTokenizer

    # Downloads the pretrained tokenizer on first use.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    text = "Hello, how are you?"
    encoded = tokenizer(text, return_tensors='pt')

    print(f"텍스트: {text}")
    print(f"토큰: {tokenizer.tokenize(text)}")
    print(f"input_ids: {encoded['input_ids'].tolist()}")
    print(f"attention_mask: {encoded['attention_mask'].tolist()}")

    # Batch encoding: padding=True pads to the longest sequence in the batch.
    texts = ["Hello world", "How are you?", "I'm fine"]
    batch_encoded = tokenizer(texts, padding=True, return_tensors='pt')
    print(f"\n배치 인코딩 shape: {batch_encoded['input_ids'].shape}")

except ImportError:
    # Best-effort demo: skip quietly when transformers is not installed.
    print("transformers 미설치 (pip install transformers)")

# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("토큰화 정리")
print("=" * 60)

summary = """
토큰화 파이프라인:
    텍스트 → 전처리 → 토큰화 → 어휘 매핑 → 패딩 → 텐서

주요 기법:
    - 단어 토큰화: 공백/구두점 기준 분리
    - 서브워드 토큰화: BPE, WordPiece, SentencePiece
    - 어휘 사전: word2idx, idx2word 매핑

HuggingFace 사용:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer(text, padding=True, return_tensors='pt')
"""
print(summary)