# 01_tokenization.py
  1"""
  201. NLP ๊ธฐ์ดˆ - ํ† ํฐํ™” ์˜ˆ์ œ
  3
  4ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ์™€ ํ† ํฐํ™” ๊ธฐ๋ฒ• ์‹ค์Šต
  5"""
  6
  7import re
  8from collections import Counter
  9
 10print("=" * 60)
 11print("NLP ๊ธฐ์ดˆ: ํ† ํฐํ™”")
 12print("=" * 60)
 13
 14
 15# ============================================
 16# 1. ๊ธฐ๋ณธ ์ „์ฒ˜๋ฆฌ
 17# ============================================
 18print("\n[1] ๊ธฐ๋ณธ ์ „์ฒ˜๋ฆฌ")
 19print("-" * 40)
 20
 21def preprocess(text):
 22    """๊ธฐ๋ณธ ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ"""
 23    # ์†Œ๋ฌธ์ž ๋ณ€ํ™˜
 24    text = text.lower()
 25    # ํŠน์ˆ˜๋ฌธ์ž ์ œ๊ฑฐ
 26    text = re.sub(r'[^\w\s]', '', text)
 27    # ๋‹ค์ค‘ ๊ณต๋ฐฑ ์ •๋ฆฌ
 28    text = re.sub(r'\s+', ' ', text).strip()
 29    return text
 30
 31sample = "Hello, World! This is NLP   processing."
 32cleaned = preprocess(sample)
 33print(f"์›๋ณธ: {sample}")
 34print(f"์ „์ฒ˜๋ฆฌ: {cleaned}")
 35
 36
 37# ============================================
 38# 2. ๋‹จ์–ด ํ† ํฐํ™”
 39# ============================================
 40print("\n[2] ๋‹จ์–ด ํ† ํฐํ™”")
 41print("-" * 40)
 42
 43def simple_tokenize(text):
 44    """๊ณต๋ฐฑ ๊ธฐ๋ฐ˜ ํ† ํฐํ™”"""
 45    return text.lower().split()
 46
 47text = "I love natural language processing"
 48tokens = simple_tokenize(text)
 49print(f"ํ…์ŠคํŠธ: {text}")
 50print(f"ํ† ํฐ: {tokens}")
 51
 52# NLTK ํ† ํฐํ™” (์„ค์น˜ ํ•„์š”: pip install nltk)
 53try:
 54    import nltk
 55    nltk.download('punkt', quiet=True)
 56    from nltk.tokenize import word_tokenize
 57
 58    text2 = "I don't like it. It's not good!"
 59    nltk_tokens = word_tokenize(text2)
 60    print(f"\nNLTK ํ† ํฐํ™”: {nltk_tokens}")
 61except ImportError:
 62    print("\nNLTK ๋ฏธ์„ค์น˜ (pip install nltk)")
 63
 64
 65# ============================================
 66# 3. ์–ดํœ˜ ์‚ฌ์ „ ๊ตฌ์ถ•
 67# ============================================
 68print("\n[3] ์–ดํœ˜ ์‚ฌ์ „ ๊ตฌ์ถ•")
 69print("-" * 40)
 70
 71class Vocabulary:
 72    def __init__(self, min_freq=1):
 73        self.word2idx = {'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3}
 74        self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<bos>', 3: '<eos>'}
 75        self.word_freq = Counter()
 76        self.min_freq = min_freq
 77
 78    def build(self, texts):
 79        """ํ…์ŠคํŠธ ๋ฆฌ์ŠคํŠธ๋กœ ์–ดํœ˜ ๊ตฌ์ถ•"""
 80        for text in texts:
 81            tokens = simple_tokenize(text)
 82            self.word_freq.update(tokens)
 83
 84        idx = len(self.word2idx)
 85        for word, freq in self.word_freq.items():
 86            if freq >= self.min_freq and word not in self.word2idx:
 87                self.word2idx[word] = idx
 88                self.idx2word[idx] = word
 89                idx += 1
 90
 91    def encode(self, text):
 92        """ํ…์ŠคํŠธ๋ฅผ ์ธ๋ฑ์Šค๋กœ ๋ณ€ํ™˜"""
 93        tokens = simple_tokenize(text)
 94        return [self.word2idx.get(t, self.word2idx['<unk>']) for t in tokens]
 95
 96    def decode(self, indices):
 97        """์ธ๋ฑ์Šค๋ฅผ ํ† ํฐ์œผ๋กœ ๋ณ€ํ™˜"""
 98        return [self.idx2word.get(i, '<unk>') for i in indices]
 99
100    def __len__(self):
101        return len(self.word2idx)
102
# Build the vocabulary from a small toy corpus.
texts = [
    "I love machine learning",
    "Machine learning is amazing",
    "Deep learning is a subset of machine learning",
    "I love deep learning"
]

vocab = Vocabulary(min_freq=1)
vocab.build(texts)

print(f"어휘 크기: {len(vocab)}")
print(f"상위 빈도 단어: {vocab.word_freq.most_common(5)}")

# Round-trip: encode text to indices, then decode back to tokens.
test_text = "I love learning"
encoded = vocab.encode(test_text)
decoded = vocab.decode(encoded)
print(f"\n원본: {test_text}")
print(f"인코딩: {encoded}")
print(f"디코딩: {decoded}")

# ============================================
# 4. Padding
# ============================================
print("\n[4] 패딩")
print("-" * 40)

def pad_sequences(sequences, max_len=None, pad_value=0):
    """Pad (or truncate) each sequence to a common length.

    Args:
        sequences: List of lists of token ids.
        max_len: Target length; inferred as the longest sequence when None.
        pad_value: Fill value appended to short sequences.

    Returns:
        New list of lists, each exactly max_len long.
    """
    if max_len is None:
        # default=0 keeps an empty input from raising ValueError in max().
        max_len = max((len(seq) for seq in sequences), default=0)

    return [
        seq[:max_len] if len(seq) > max_len
        else seq + [pad_value] * (max_len - len(seq))
        for seq in sequences
    ]

# Demonstrate padding on encoded sequences of different lengths.
sequences = [
    vocab.encode("I love learning"),
    vocab.encode("Machine learning is great"),
    vocab.encode("Deep")
]

print("원본 시퀀스:")
for seq in sequences:
    print(f"  {seq}")

padded = pad_sequences(sequences, max_len=5)
print("\n패딩 후:")
for seq in padded:
    print(f"  {seq}")

# ============================================
# 5. HuggingFace tokenizer (requires install)
# ============================================
print("\n[5] HuggingFace 토크나이저")
print("-" * 40)

try:
    from transformers import AutoTokenizer

    # Downloads the pretrained tokenizer on first use.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    text = "Hello, how are you?"
    encoded = tokenizer(text, return_tensors='pt')

    print(f"텍스트: {text}")
    print(f"토큰: {tokenizer.tokenize(text)}")
    print(f"input_ids: {encoded['input_ids'].tolist()}")
    print(f"attention_mask: {encoded['attention_mask'].tolist()}")

    # Batch encoding: padding=True pads to the longest sequence in the batch.
    texts = ["Hello world", "How are you?", "I'm fine"]
    batch_encoded = tokenizer(texts, padding=True, return_tensors='pt')
    print(f"\n배치 인코딩 shape: {batch_encoded['input_ids'].shape}")

except ImportError:
    # Best-effort demo: skip quietly when transformers is not installed.
    print("transformers 미설치 (pip install transformers)")

# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("토큰화 정리")
print("=" * 60)

summary = """
토큰화 파이프라인:
    텍스트 → 전처리 → 토큰화 → 어휘 매핑 → 패딩 → 텐서

주요 기법:
    - 단어 토큰화: 공백/구두점 기준 분리
    - 서브워드 토큰화: BPE, WordPiece, SentencePiece
    - 어휘 사전: word2idx, idx2word 매핑

HuggingFace 사용:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer(text, padding=True, return_tensors='pt')
"""
print(summary)