02. Word2Vec and GloVe
Learning Objectives
- The concept of distributed representations
- Word2Vec (Skip-gram, CBOW)
- GloVe embeddings
- Using pretrained embeddings
1. Word Embedding Overview
One-Hot vs Distributed Representations
One-hot (sparse representation):
"king" → [1, 0, 0, 0, ...] (V dimensions)
"queen" → [0, 1, 0, 0, ...]
Problem: cannot express semantic similarity
cosine_similarity(king, queen) = 0
Distributed representation (dense):
"king" → [0.2, -0.5, 0.8, ...] (d dimensions, d << V)
"queen" → [0.3, -0.4, 0.7, ...]
Advantage: reflects semantic similarity
cosine_similarity(king, queen) ≈ 0.9
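A quick check in PyTorch makes the difference concrete (the dense vectors below are made-up values standing in for trained embeddings):

import torch
import torch.nn.functional as F

# One-hot vectors are orthogonal, so their cosine similarity is exactly 0
king_oh = torch.tensor([1., 0., 0., 0.])
queen_oh = torch.tensor([0., 1., 0., 0.])
print(F.cosine_similarity(king_oh, queen_oh, dim=0))  # tensor(0.)

# Dense vectors for related words can point in similar directions
king_d = torch.tensor([0.2, -0.5, 0.8])
queen_d = torch.tensor([0.3, -0.4, 0.7])
print(F.cosine_similarity(king_d, queen_d, dim=0))  # ≈ 0.99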
Distributional Hypothesis
"Words that occur in the same contexts tend to have similar meanings" (You shall know a word by the company it keeps)
"The cat sat on the ___" → mat, floor, couch
"The dog lay on the ___" → mat, floor, couch
cat ≈ dog (similar contexts)
2. Word2Vec
Skip-gram
Learns the center word's representation by predicting its surrounding words.
Input: center word → Predict: context words
Sentence: "The quick brown fox jumps"
Center word: "brown"
Prediction targets: ["quick", "fox"] (window=1) or ["The", "quick", "fox", "jumps"] (window=2)
Model:
"brown" → embedding → Softmax → P(context | center)
CBOW (Continuous Bag of Words)
Predicts the center word from its surrounding words.
Input: context words → Predict: center word
Sentence: "The quick brown fox jumps"
Context words: ["quick", "fox"]
Prediction target: "brown"
Model:
["quick", "fox"] → averaged embeddings → Softmax → P(center | context)
Word2Vec Architecture
import torch
import torch.nn as nn
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Input embeddings (center words)
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        # Output embeddings (context words)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context):
        # center: (batch,)
        # context: (batch,)
        center_emb = self.center_embeddings(center)     # (batch, embed)
        context_emb = self.context_embeddings(context)  # (batch, embed)
        # Similarity as a dot product
        score = (center_emb * context_emb).sum(dim=1)   # (batch,)
        return score
class CBOW(nn.Module):
def __init__(self, vocab_size, embed_dim):
super().__init__()
self.context_embeddings = nn.Embedding(vocab_size, embed_dim)
self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
def forward(self, context, center):
# context: (batch, window*2)
# center: (batch,)
context_emb = self.context_embeddings(context) # (batch, window*2, embed)
context_mean = context_emb.mean(dim=1) # (batch, embed)
center_emb = self.center_embeddings(center) # (batch, embed)
score = (context_mean * center_emb).sum(dim=1)
return score
Negative Sampling
Softmax over the entire vocabulary is computationally expensive, so each (center, context) pair is instead scored against a few randomly sampled negative words.
class SkipGramNegSampling(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context, neg_context):
        # center: (batch,)
        # context: (batch,) - true context words
        # neg_context: (batch, k) - randomly sampled negative words
        center_emb = self.center_embeddings(center)  # (batch, embed)
        # Positive: similarity with the true context word
        pos_emb = self.context_embeddings(context)
        pos_score = (center_emb * pos_emb).sum(dim=1)  # (batch,)
        # Negative: similarity with the sampled negative words
        neg_emb = self.context_embeddings(neg_context)  # (batch, k, embed)
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)  # (batch, k)
        return pos_score, neg_score
# Loss function
def negative_sampling_loss(pos_score, neg_score):
pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-10)
neg_loss = -torch.log(torch.sigmoid(-neg_score) + 1e-10).sum(dim=1)
return (pos_loss + neg_loss).mean()
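A minimal training-step sketch tying the pieces together. The negatives are drawn from the unigram distribution raised to the 3/4 power, as in the original Word2Vec paper; the word counts and batch tensors below are random placeholders:

vocab_size, embed_dim, k = 1000, 100, 5
model = SkipGramNegSampling(vocab_size, embed_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Negative-sampling distribution: unigram counts to the 3/4 power
word_counts = torch.ones(vocab_size)  # placeholder: real corpus counts go here
neg_dist = word_counts ** 0.75
neg_dist /= neg_dist.sum()

# One step on a dummy batch of (center, context) index pairs
center = torch.randint(0, vocab_size, (32,))
context = torch.randint(0, vocab_size, (32,))
neg_context = torch.multinomial(neg_dist, 32 * k, replacement=True).view(32, k)

pos_score, neg_score = model(center, context, neg_context)
loss = negative_sampling_loss(pos_score, neg_score)
optimizer.zero_grad()
loss.backward()
optimizer.step()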
3. GloVe
Concept
Uses global co-occurrence statistics.
Co-occurrence matrix X:
X[i,j] = number of times words i and j appear together
Objective:
w_i · w_j + b_i + b_j ≈ log(X[i,j])
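A sketch of building X from a tokenized corpus (build_cooccurrence is an illustrative helper; GloVe also down-weights distant pairs by 1/distance, which is included here, and uses sparse storage in practice):

import numpy as np

def build_cooccurrence(corpus, word2idx, window=5):
    """Accumulate a (dense) co-occurrence matrix X over a tokenized corpus."""
    V = len(word2idx)
    X = np.zeros((V, V), dtype=np.float32)
    for tokens in corpus:
        ids = [word2idx[t] for t in tokens if t in word2idx]
        for i, wi in enumerate(ids):
            for j in range(max(0, i - window), min(len(ids), i + window + 1)):
                if j != i:
                    X[wi, ids[j]] += 1.0 / abs(i - j)  # 1/distance weighting
    return X

corpus = [["the", "cat", "sat", "on", "the", "mat"]]
word2idx = {w: i for i, w in enumerate(["the", "cat", "sat", "on", "mat"])}
X = build_cooccurrence(corpus, word2idx)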
GloVe Loss Function
def glove_loss(w_i, w_j, b_i, b_j, X_ij, x_max=100, alpha=0.75):
    """
    w_i, w_j: word embeddings
    b_i, b_j: biases
    X_ij: co-occurrence counts
    """
    # Weighting function (caps the influence of very frequent pairs)
    weight = torch.clamp(X_ij / x_max, max=1.0) ** alpha
    # Difference between prediction and target
    prediction = (w_i * w_j).sum(dim=1) + b_i + b_j
    target = torch.log(X_ij + 1e-10)
    loss = weight * (prediction - target) ** 2
    return loss.mean()
GloVe ꡬν¶
class GloVe(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Two embedding matrices (word and context roles)
        self.w_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.c_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.w_bias = nn.Embedding(vocab_size, 1)
        self.c_bias = nn.Embedding(vocab_size, 1)

    def forward(self, i, j, cooccur):
        w_i = self.w_embeddings(i)
        w_j = self.c_embeddings(j)
        b_i = self.w_bias(i).squeeze()
        b_j = self.c_bias(j).squeeze()
        return glove_loss(w_i, w_j, b_i, b_j, cooccur)

    def get_embedding(self, word_idx):
        # Final embedding: average of the two embeddings
        return (self.w_embeddings.weight[word_idx] +
                self.c_embeddings.weight[word_idx]) / 2
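A minimal training-loop sketch over nonzero co-occurrence entries, assuming (i, j, X_ij) triples have already been extracted from a matrix like the one above (random placeholders here; the GloVe paper trains with AdaGrad):

vocab_size, embed_dim = 1000, 100
model = GloVe(vocab_size, embed_dim)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.05)

# Placeholder batch of nonzero entries (i, j, X_ij)
i_idx = torch.randint(0, vocab_size, (512,))
j_idx = torch.randint(0, vocab_size, (512,))
x_ij = torch.rand(512) * 50 + 1

for epoch in range(10):
    loss = model(i_idx, j_idx, x_ij)  # forward already returns the loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()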
4. Using Pretrained Embeddings
Gensim Word2Vec
from gensim.models import Word2Vec

# Train
sentences = [["I", "love", "NLP"], ["NLP", "is", "fun"]]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

# Most similar words
similar = model.wv.most_similar("NLP", topn=5)

# Get a word vector
vector = model.wv["NLP"]

# Save / load
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
Pretrained GloVe
import numpy as np

def load_glove(path, embed_dim=100):
    """Load a GloVe text file into a {word: vector} dict."""
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Usage
glove = load_glove('glove.6B.100d.txt')
vector = glove.get('king', np.zeros(100))
Applying to a PyTorch Embedding Layer
import torch
import torch.nn as nn
def create_embedding_layer(vocab, glove, embed_dim=100, freeze=True):
    """Initialize an Embedding layer with pretrained embeddings."""
    vocab_size = len(vocab)
    embedding_matrix = torch.zeros(vocab_size, embed_dim)
    found = 0
    for word, idx in vocab.word2idx.items():
        if word in glove:
            embedding_matrix[idx] = torch.from_numpy(glove[word])
            found += 1
        else:
            # Random initialization for out-of-vocabulary words
            embedding_matrix[idx] = torch.randn(embed_dim) * 0.1
    print(f"Pretrained embeddings applied: {found}/{vocab_size}")
    embedding = nn.Embedding.from_pretrained(
        embedding_matrix,
        freeze=freeze,  # if True, embeddings are not updated during training
        padding_idx=vocab.word2idx.get('<pad>', 0)
    )
    return embedding

# Applying it in a model
class TextClassifier(nn.Module):
    def __init__(self, vocab, glove, num_classes):
        super().__init__()
        self.embedding = create_embedding_layer(vocab, glove, freeze=False)
        self.fc = nn.Linear(100, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)   # (batch, seq, 100)
        pooled = embedded.mean(dim=1)  # mean pooling
        return self.fc(pooled)
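Hypothetical usage with stand-in objects: a tiny Vocab class exposing word2idx and __len__ (the interface create_embedding_layer expects) and random vectors in place of real GloVe entries:

import numpy as np

class Vocab:
    """Minimal stand-in vocab with the interface create_embedding_layer uses."""
    def __init__(self, words):
        self.word2idx = {w: i for i, w in enumerate(words)}
    def __len__(self):
        return len(self.word2idx)

vocab = Vocab(["<pad>", "i", "love", "nlp"])
glove = {w: np.random.randn(100).astype("float32") for w in ["love", "nlp"]}

model = TextClassifier(vocab, glove, num_classes=2)
x = torch.tensor([[1, 2, 3, 0]])  # token indices for "i love nlp <pad>"
logits = model(x)                 # (1, 2)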
5. Embedding Arithmetic
Similarity Computation
import torch
import torch.nn.functional as F
def cosine_similarity(v1, v2):
return F.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0))
# Find the most similar words
def most_similar(word, embeddings, vocab, topk=5):
    word_vec = embeddings[vocab[word]]
    similarities = F.cosine_similarity(word_vec.unsqueeze(0), embeddings)
    values, indices = similarities.topk(topk + 1)
    results = []
    for val, idx in zip(values[1:], indices[1:]):  # skip the word itself
        results.append((vocab.idx2word[idx.item()], val.item()))
    return results
Word Analogies
def word_analogy(a, b, c, embeddings, vocab, topk=5):
    """
    a : b = c : ?
    e.g. king : queen = man : woman
    vector(?) = vector(b) - vector(a) + vector(c)
    """
    vec_a = embeddings[vocab[a]]
    vec_b = embeddings[vocab[b]]
    vec_c = embeddings[vocab[c]]
    # Analogy vector
    target_vec = vec_b - vec_a + vec_c
    # Find the most similar words
    similarities = F.cosine_similarity(target_vec.unsqueeze(0), embeddings)
    values, indices = similarities.topk(topk + 3)
    # Exclude a, b, c themselves
    exclude = {vocab[a], vocab[b], vocab[c]}
    results = []
    for val, idx in zip(values, indices):
        if idx.item() not in exclude:
            results.append((vocab.idx2word[idx.item()], val.item()))
        if len(results) == topk:
            break
    return results

# Example
# word_analogy("king", "queen", "man", embeddings, vocab)
# → [("woman", 0.85), ...]
6. Visualization
t-SNE Visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def visualize_embeddings(embeddings, words, vocab):
    # Embeddings of the selected words
    indices = [vocab[w] for w in words]
    vectors = embeddings[indices].numpy()
    # t-SNE
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(words) - 1))
    reduced = tsne.fit_transform(vectors)
    # Plot
    plt.figure(figsize=(12, 8))
    plt.scatter(reduced[:, 0], reduced[:, 1])
    for i, word in enumerate(words):
        plt.annotate(word, (reduced[i, 0], reduced[i, 1]))
    plt.title('Word Embeddings (t-SNE)')
    plt.savefig('embeddings_tsne.png')
    plt.close()

# Usage
words = ['king', 'queen', 'man', 'woman', 'dog', 'cat', 'apple', 'orange']
visualize_embeddings(embeddings, words, vocab)
7. Word2Vec vs GloVe
| Aspect | Word2Vec | GloVe |
|---|---|---|
| Approach | Prediction-based | Count-based |
| Training signal | Words within a local window | Global co-occurrence statistics |
| Memory | Low | Requires a co-occurrence matrix |
| Training speed | Fast with negative sampling | Fast once the matrix is precomputed |
| Performance | Comparable | Comparable |
Summary
Key Concepts
- Distributed representation: words as dense vectors
- Skip-gram: predict context from the center word
- CBOW: predict the center word from context
- GloVe: leverages global co-occurrence statistics
- Word analogies: queen - king + man ≈ woman
Key Code
# Gensim Word2Vec
from gensim.models import Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5)

# Apply pretrained embeddings
embedding = nn.Embedding.from_pretrained(pretrained_matrix, freeze=False)

# Similarity
similarity = F.cosine_similarity(vec1, vec2)
Next Steps
03_Transformer_Review.md revisits the Transformer architecture from an NLP perspective.