06. HuggingFace Basics
Learning Objectives
- Understand the Transformers library
- Use the Pipeline API
- Load tokenizers and models
- Perform a variety of tasks
1. The HuggingFace Ecosystem
Main Components
HuggingFace
├── Transformers   # Model library
├── Datasets       # Datasets
├── Tokenizers     # Tokenizers
├── Hub            # Model/data repository
├── Accelerate     # Distributed training
└── Evaluate       # Evaluation metrics
Installation
pip install transformers datasets tokenizers accelerate evaluate
2. Pipeline API
Simplest Usage
from transformers import pipeline
# Sentiment analysis
classifier = pipeline("sentiment-analysis")
result = classifier("I love this movie!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]
# Batch processing
results = classifier([
    "I love this movie!",
    "This is terrible."
])
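Pipelines run on CPU by default. A minimal sketch of placing one on a GPU when available (the device index 0 is an assumption for a single-GPU machine):
# Optional: run the pipeline on GPU 0 if CUDA is available (device=-1 means CPU)
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("sentiment-analysis", device=device)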
Supported Tasks
| Task | Pipeline name | Description |
|------|---------------|-------------|
| Sentiment analysis | sentiment-analysis | Positive/negative classification |
| Text classification | text-classification | General classification |
| NER | ner | Named entity recognition |
| QA | question-answering | Question answering |
| Summarization | summarization | Text summarization |
| Translation | translation | Language translation |
| Text generation | text-generation | Sentence generation |
| Fill-Mask | fill-mask | Masked-token prediction |
| Zero-shot | zero-shot-classification | Classification without predefined labels |
Various Pipeline Examples
# Question answering
qa = pipeline("question-answering")
result = qa(
    question="What is the capital of France?",
    context="Paris is the capital and most populous city of France."
)
# {'answer': 'Paris', 'score': 0.99, 'start': 0, 'end': 5}
# Summarization
summarizer = pipeline("summarization")
text = "Very long article text here..."
summary = summarizer(text, max_length=50, min_length=10)
# Translation
translator = pipeline("translation_en_to_fr")
result = translator("Hello, how are you?")
# [{'translation_text': 'Bonjour, comment allez-vous?'}]
# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_length=50)
# NER
ner = pipeline("ner", grouped_entities=True)
result = ner("My name is John and I work at Google in New York")
# [{'entity_group': 'PER', 'word': 'John', ...},
# {'entity_group': 'ORG', 'word': 'Google', ...},
# {'entity_group': 'LOC', 'word': 'New York', ...}]
# Zero-shot classification
classifier = pipeline("zero-shot-classification")
result = classifier(
    "I want to go to the beach",
    candidate_labels=["travel", "cooking", "technology"]
)
# {'labels': ['travel', 'cooking', 'technology'], 'scores': [0.95, 0.03, 0.02]}
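The fill-mask task from the table above has no example yet. A minimal sketch, assuming the bert-base-uncased checkpoint (whose mask token is [MASK]):
# Fill-Mask: predict the masked token
fill = pipeline("fill-mask", model="bert-base-uncased")
result = fill("The capital of France is [MASK].")
# Each candidate has a score, the predicted token, and the completed sentence
for candidate in result[:3]:
    print(candidate['token_str'], candidate['score'])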
Specifying a Particular Model
# Korean model
classifier = pipeline(
    "sentiment-analysis",
    model="beomi/kcbert-base"
)
# Multilingual model
qa = pipeline(
    "question-answering",
    model="deepset/xlm-roberta-large-squad2"
)
3. Tokenizers
AutoTokenizer
from transformers import AutoTokenizer
# Automatically load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Encoding
text = "Hello, how are you?"
encoded = tokenizer(text)
print(encoded)
# {'input_ids': [101, 7592, ...], 'attention_mask': [1, 1, ...], ...}
# Return tensors
encoded = tokenizer(text, return_tensors='pt')
Key Parameters
encoded = tokenizer(
    text,
    padding=True,              # Add padding
    truncation=True,           # Truncate to the maximum length
    max_length=128,            # Maximum length
    return_tensors='pt',       # PyTorch tensors
    return_attention_mask=True,
    return_token_type_ids=True
)
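token_type_ids mainly matter for sentence-pair inputs (e.g. QA or NLI). A short sketch of encoding a pair with the bert-base-uncased tokenizer loaded above (the example sentences are placeholders):
# Sentence-pair encoding: token_type_ids marks which segment each token belongs to
pair = tokenizer(
    "How old are you?",        # segment A (e.g. a question)
    "I am 25 years old.",      # segment B (e.g. a context)
    return_tensors='pt'
)
print(pair['token_type_ids'])  # 0s for segment A, 1s for segment B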
Batch Encoding
texts = ["Hello world", "How are you?", "I'm fine"]
# Dynamic padding
encoded = tokenizer(
    texts,
    padding=True,        # Pad to the longest sequence in the batch
    truncation=True,
    return_tensors='pt'
)
print(encoded['input_ids'].shape)  # (3, max_len)
Decoding
# Decode
decoded = tokenizer.decode(encoded['input_ids'][0])
print(decoded)  # "[CLS] hello world [SEP]"
# Strip special tokens
decoded = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
print(decoded)  # "hello world"
Inspecting Tokens
# Token list
tokens = tokenizer.tokenize("Hello, how are you?")
print(tokens)  # ['hello', ',', 'how', 'are', 'you', '?']
# Tokens → IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
# IDs → tokens
tokens = tokenizer.convert_ids_to_tokens(ids)
4. Loading Models
AutoModel
from transformers import AutoModel, AutoModelForSequenceClassification
# Base model (output: hidden states)
model = AutoModel.from_pretrained("bert-base-uncased")
# Classification model (output: logits)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)
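To make the hidden-states vs. logits distinction concrete, here is a minimal sketch of inspecting the base model's output (shapes assume bert-base-uncased with hidden size 768):
# The base AutoModel returns one hidden-state vector per token, not class logits
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
base_model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = base_model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, seq_len, 768)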
Task-Specific AutoModel Classes
from transformers import (
    AutoModelForSequenceClassification,  # Sequence classification
    AutoModelForTokenClassification,     # Token classification (NER)
    AutoModelForQuestionAnswering,       # QA
    AutoModelForCausalLM,                # GPT-style generation
    AutoModelForSeq2SeqLM,               # Encoder-decoder (translation, summarization)
    AutoModelForMaskedLM                 # BERT-style masked LM
)
Inference
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# Encode
inputs = tokenizer("I love this movie!", return_tensors="pt")
# Run inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
# Predict
predictions = torch.softmax(logits, dim=-1)
predicted_class = predictions.argmax().item()
print(f"Class: {predicted_class}, Confidence: {predictions[0][predicted_class]:.4f}")
5. The Datasets Library
Loading a Dataset
from datasets import load_dataset
# Load from the HuggingFace Hub
dataset = load_dataset("imdb")
print(dataset)
# DatasetDict({
#     train: Dataset({features: ['text', 'label'], num_rows: 25000})
#     test: Dataset({features: ['text', 'label'], num_rows: 25000})
# })
# Specify a split
train_data = load_dataset("imdb", split="train")
test_data = load_dataset("imdb", split="test[:1000]")  # First 1000 examples
# Inspect a sample
print(train_data[0])
# {'text': '...', 'label': 1}
Data Preprocessing
def preprocess(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )
# Apply with map
tokenized_dataset = dataset.map(preprocess, batched=True)
# Remove columns that are no longer needed
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
# Set the PyTorch format
tokenized_dataset.set_format('torch')
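For quick experiments it can help to work on a small shuffled subset. A short sketch using the Datasets API on the tokenized_dataset defined above (the subset size is arbitrary):
# Optional: carve out a small, shuffled subset for fast iteration
small_train = tokenized_dataset['train'].shuffle(seed=42).select(range(1000))
print(len(small_train))  # 1000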
Creating a DataLoader
from torch.utils.data import DataLoader
train_loader = DataLoader(
    tokenized_dataset['train'],
    batch_size=16,
    shuffle=True
)
for batch in train_loader:
    print(batch['input_ids'].shape)  # (16, 256)
    break
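As an alternative to padding every example to max_length during preprocessing, batches can be padded dynamically. A sketch using DataCollatorWithPadding, assuming preprocess() is changed to tokenize with truncation only (no padding):
# Dynamic per-batch padding with a data collator
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')
train_loader = DataLoader(
    tokenized_dataset['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=collator  # Pads each batch only up to its longest sequence
)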
6. Trainer API
Basic Training
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
# Data
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)
tokenized = dataset.map(tokenize, batched=True)
# Model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)
# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
)
# Train
trainer.train()
# Evaluate
results = trainer.evaluate()
print(results)
Custom Metrics
import evaluate
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    compute_metrics=compute_metrics
)
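Once trainer.train() has run, predictions over an entire split can be obtained with trainer.predict. A minimal sketch, continuing from the Trainer defined above:
# Run the trained model over the test split and inspect the metrics
pred_output = trainer.predict(tokenized['test'])
print(pred_output.metrics)  # e.g. {'test_loss': ..., 'test_accuracy': ...}
predicted_labels = pred_output.predictions.argmax(axis=-1)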
7. Saving/Loading Models
Saving Locally
# Save
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")
# Load
model = AutoModelForSequenceClassification.from_pretrained("./my_model")
tokenizer = AutoTokenizer.from_pretrained("./my_model")
Uploading to the Hub
# Log in
from huggingface_hub import login
login(token="your_token")
# Upload
model.push_to_hub("my-username/my-model")
tokenizer.push_to_hub("my-username/my-model")
# Or via the Trainer (the target repo is set through TrainingArguments, e.g. hub_model_id)
trainer.push_to_hub()
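A locally saved directory can also be plugged straight into a pipeline. A short sketch using the ./my_model directory saved above:
# Reuse the saved model/tokenizer through the pipeline API
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="./my_model",      # local directory
    tokenizer="./my_model"
)
print(classifier("Great library!"))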
8. End-to-End Example: Sentiment Classification
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import evaluate
# 1. Load the data
dataset = load_dataset("imdb")
# 2. Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)
tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
# 4. Metric
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions = eval_pred.predictions.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
# 5. Training configuration
args = TrainingArguments(
    output_dir="./imdb_classifier",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is available
)
# 6. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    compute_metrics=compute_metrics,
)
# 7. Train
trainer.train()
# 8. Inference
def predict(text):
    # Move inputs to the model's device (the model may be on GPU after training)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    return "Positive" if probs[0][1] > 0.5 else "Negative", probs[0][1].item()
print(predict("This movie was amazing!"))
# ('Positive', 0.9876)
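If the fine-tuned model should be reused later, it can be persisted the same way as in section 7. A short optional sketch (the output path is an assumption):
# 9. (Optional) Persist the fine-tuned model and tokenizer for later reuse
trainer.save_model("./imdb_classifier/final")
tokenizer.save_pretrained("./imdb_classifier/final")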
Summary
Core Classes
| Class | Purpose |
|-------|---------|
| pipeline | Quick inference |
| AutoTokenizer | Automatic tokenizer loading |
| AutoModel* | Automatic model loading |
| Trainer | Automated training loop |
| TrainingArguments | Training configuration |
Core Code
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Quick inference
classifier = pipeline("sentiment-analysis")
result = classifier("I love this!")
# Custom inference
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
inputs = tokenizer("Hello", return_tensors="pt")
outputs = model(**inputs)
Next Steps
07_Fine_Tuning.md covers fine-tuning techniques for a variety of tasks.