"""
01. NLP Basics - Tokenization Examples

Hands-on practice with text preprocessing and tokenization techniques.
"""

import re
from collections import Counter

print("=" * 60)
print("NLP Basics: Tokenization")
print("=" * 60)

# ============================================
# 1. Basic preprocessing
# ============================================
print("\n[1] Basic preprocessing")
print("-" * 40)

def preprocess(text):
    """Basic text preprocessing.

    Lowercases the input, strips punctuation/special characters, and
    collapses runs of whitespace into single spaces.

    Args:
        text: raw input string.

    Returns:
        The cleaned string (may be empty).
    """
    text = text.lower()
    # Drop everything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse any whitespace run into a single space and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Demo: preprocess a sample sentence.
sample = "Hello, World! This is NLP processing."
cleaned = preprocess(sample)
print(f"Original: {sample}")
print(f"Preprocessed: {cleaned}")

# ============================================
# 2. Word tokenization
# ============================================
print("\n[2] Word tokenization")
print("-" * 40)

def simple_tokenize(text):
    """Whitespace-based tokenization: lowercase the text and split on whitespace.

    Args:
        text: raw input string.

    Returns:
        List of lowercase tokens (empty list for empty/whitespace-only input).
    """
    return text.lower().split()

# Demo: naive whitespace tokenization.
text = "I love natural language processing"
tokens = simple_tokenize(text)
print(f"Text: {text}")
print(f"Tokens: {tokens}")

# NLTK tokenization (requires: pip install nltk)
try:
    import nltk
    nltk.download('punkt', quiet=True)
    from nltk.tokenize import word_tokenize

    # word_tokenize splits contractions and punctuation, unlike str.split.
    text2 = "I don't like it. It's not good!"
    nltk_tokens = word_tokenize(text2)
    print(f"\nNLTK tokens: {nltk_tokens}")
except ImportError:
    print("\nNLTK not installed (pip install nltk)")

# ============================================
# 3. Building a vocabulary
# ============================================
print("\n[3] Building a vocabulary")
print("-" * 40)

class Vocabulary:
    """Word-level vocabulary with special tokens and frequency filtering.

    Ids 0-3 are reserved for <pad>, <unk>, <bos>, <eos>; real words are
    assigned ids starting at 4, in corpus first-seen order, and only when
    their corpus frequency reaches min_freq.
    """

    def __init__(self, min_freq=1):
        # Special tokens always occupy the first four ids.
        self.word2idx = {'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3}
        self.idx2word = {0: '<pad>', 1: '<unk>', 2: '<bos>', 3: '<eos>'}
        self.word_freq = Counter()
        self.min_freq = min_freq  # words rarer than this are excluded

    def build(self, texts):
        """Build the vocabulary from a list of raw text strings."""
        for text in texts:
            self.word_freq.update(simple_tokenize(text))

        # Assign ids after the reserved special tokens.
        idx = len(self.word2idx)
        for word, freq in self.word_freq.items():
            if freq >= self.min_freq and word not in self.word2idx:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1

    def encode(self, text):
        """Convert a text string to token indices; unknown words map to <unk>."""
        tokens = simple_tokenize(text)
        return [self.word2idx.get(t, self.word2idx['<unk>']) for t in tokens]

    def decode(self, indices):
        """Convert a list of indices back to token strings ('<unk>' for unknown ids)."""
        return [self.idx2word.get(i, '<unk>') for i in indices]

    def __len__(self):
        return len(self.word2idx)

# Build a vocabulary from a tiny demo corpus.
texts = [
    "I love machine learning",
    "Machine learning is amazing",
    "Deep learning is a subset of machine learning",
    "I love deep learning"
]

vocab = Vocabulary(min_freq=1)
vocab.build(texts)

print(f"Vocabulary size: {len(vocab)}")
print(f"Most frequent words: {vocab.word_freq.most_common(5)}")

# Encoding / decoding round trip.
test_text = "I love learning"
encoded = vocab.encode(test_text)
decoded = vocab.decode(encoded)
print(f"\nOriginal: {test_text}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

# ============================================
# 4. Padding
# ============================================
print("\n[4] Padding")
print("-" * 40)

def pad_sequences(sequences, max_len=None, pad_value=0):
    """Pad (or truncate) each sequence to a common length.

    Args:
        sequences: list of lists of token ids.
        max_len: target length; defaults to the length of the longest sequence.
        pad_value: fill value appended to sequences shorter than max_len.

    Returns:
        A new list of lists, each exactly max_len long. Empty input yields [].
    """
    # Guard: max() would raise on an empty list.
    if not sequences:
        return []
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded = []
    for seq in sequences:
        if len(seq) > max_len:
            # Truncate sequences that exceed the target length.
            padded.append(seq[:max_len])
        else:
            padded.append(seq + [pad_value] * (max_len - len(seq)))
    return padded

# Demo: pad a batch of encoded sequences to a fixed length.
sequences = [
    vocab.encode("I love learning"),
    vocab.encode("Machine learning is great"),
    vocab.encode("Deep")
]

print("Original sequences:")
for seq in sequences:
    print(f"  {seq}")

padded = pad_sequences(sequences, max_len=5)
print("\nAfter padding:")
for seq in padded:
    print(f"  {seq}")

# ============================================
# 5. HuggingFace tokenizer (requires install)
# ============================================
print("\n[5] HuggingFace tokenizer")
print("-" * 40)

try:
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    text = "Hello, how are you?"
    encoded = tokenizer(text, return_tensors='pt')

    print(f"Text: {text}")
    print(f"Tokens: {tokenizer.tokenize(text)}")
    print(f"input_ids: {encoded['input_ids'].tolist()}")
    print(f"attention_mask: {encoded['attention_mask'].tolist()}")

    # Batch encoding: padding=True pads to the longest item in the batch.
    texts = ["Hello world", "How are you?", "I'm fine"]
    batch_encoded = tokenizer(texts, padding=True, return_tensors='pt')
    print(f"\nBatch encoding shape: {batch_encoded['input_ids'].shape}")

except ImportError:
    print("transformers not installed (pip install transformers)")

# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("Tokenization Summary")
print("=" * 60)

summary = """
Tokenization pipeline:
    text -> preprocessing -> tokenization -> vocabulary mapping -> padding -> tensor

Key techniques:
    - Word tokenization: split on whitespace/punctuation
    - Subword tokenization: BPE, WordPiece, SentencePiece
    - Vocabulary: word2idx / idx2word mappings

HuggingFace usage:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer(text, padding=True, return_tensors='pt')
"""
print(summary)