07. νŒŒμΈνŠœλ‹

07. νŒŒμΈνŠœλ‹

ν•™μŠ΅ λͺ©ν‘œ

  • νŒŒμΈνŠœλ‹ μ „λž΅ 이해
  • λ‹€μ–‘ν•œ νƒœμŠ€ν¬ νŒŒμΈνŠœλ‹
  • 효율적인 νŒŒμΈνŠœλ‹ 기법 (LoRA, QLoRA)
  • μ‹€μ „ νŒŒμΈνŠœλ‹ νŒŒμ΄ν”„λΌμΈ

1. νŒŒμΈνŠœλ‹ κ°œμš”

μ „μ΄ν•™μŠ΅ νŒ¨λŸ¬λ‹€μž„

μ‚¬μ „ν•™μŠ΅ (Pre-training)
    β”‚  λŒ€κ·œλͺ¨ ν…μŠ€νŠΈλ‘œ 일반적인 μ–Έμ–΄ 이해 ν•™μŠ΅
    β–Ό
νŒŒμΈνŠœλ‹ (Fine-tuning)
    β”‚  νŠΉμ • νƒœμŠ€ν¬ λ°μ΄ν„°λ‘œ λͺ¨λΈ μ‘°μ •
    β–Ό
νƒœμŠ€ν¬ μˆ˜ν–‰

νŒŒμΈνŠœλ‹ μ „λž΅

| μ „λž΅ | μ„€λͺ… | μ‚¬μš© μ‹œμ  |
|------|------|-----------|
| Full Fine-tuning | 전체 νŒŒλΌλ―Έν„° μ—…λ°μ΄νŠΈ | μΆ©λΆ„ν•œ 데이터, μ»΄ν“¨νŒ… |
| Feature Extraction | λΆ„λ₯˜κΈ°λ§Œ ν•™μŠ΅ | 적은 데이터 |
| LoRA | μ €λž­ν¬ μ–΄λŒ‘ν„° | 효율적인 ν•™μŠ΅ |
| Prompt Tuning | ν”„λ‘¬ν”„νŠΈλ§Œ ν•™μŠ΅ | 맀우 적은 데이터 |

2. ν…μŠ€νŠΈ λΆ„λ₯˜ νŒŒμΈνŠœλ‹

κΈ°λ³Έ νŒŒμ΄ν”„λΌμΈ

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import evaluate

# Load the IMDB movie-review dataset (binary sentiment labels).
dataset = load_dataset("imdb")

# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize a batch of examples, truncating/padding to a fixed length."""
    return tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

tokenized = dataset.map(tokenize, batched=True)

# BERT encoder with a fresh 2-label classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

# Training configuration: 3 epochs, linear warmup over the first 10% of steps,
# evaluation at the end of every epoch.
args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    eval_strategy="epoch",
)

# Trainer wires together the model, data, and training loop.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
)

trainer.train()

닀쀑 λ ˆμ΄λΈ” λΆ„λ₯˜

from transformers import AutoModelForSequenceClassification
import torch

# Multi-label classification: the head emits 5 independent logits
# (one sigmoid per label) instead of a softmax over 5 classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    problem_type="multi_label_classification"
)

# The loss function is automatically BCEWithLogitsLoss for this problem_type.

# Label format: [1, 0, 1, 0, 1] (multi-hot vector, one entry per label)
# NOTE(review): BCEWithLogitsLoss expects float labels — confirm the dataset
# provides floats, not ints.

3. 토큰 λΆ„λ₯˜ (NER) νŒŒμΈνŠœλ‹

NER 데이터 ν˜•μ‹

from datasets import load_dataset

# CoNLL-2003 NER dataset: word-level tokens with integer tag ids.
dataset = load_dataset("conll2003")

# Inspect one record.
print(dataset['train'][0])
# {'tokens': ['EU', 'rejects', 'German', 'call', ...],
#  'ner_tags': [3, 0, 7, 0, ...]}

# Map tag ids back to their string names (IOB scheme).
label_names = dataset['train'].features['ner_tags'].feature.names
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

토큰 μ •λ ¬

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align NER tags with sub-word tokens.

    Only the first sub-word of each word keeps its tag; special tokens and
    sub-word continuations are set to -100 so the loss ignores them.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True  # input is already split into words
    )

    all_labels = []
    for batch_idx, word_tags in enumerate(examples['ner_tags']):
        aligned = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            if word_idx is None or word_idx == last_word:
                # Special token, or continuation of the previous word.
                aligned.append(-100)
            else:
                # First sub-word of a new word: keep the word's tag.
                aligned.append(word_tags[word_idx])
            last_word = word_idx
        all_labels.append(aligned)

    encoded['labels'] = all_labels
    return encoded

NER νŒŒμΈνŠœλ‹

from transformers import AutoModelForTokenClassification

# Token-classification head: one logit per label, per token.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names)
)

# seqeval computes entity-level precision/recall/F1 for IOB-tagged sequences.
import evaluate
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    """Turn logits into tag names, drop ignored positions, and run seqeval."""
    logits, gold = eval_pred
    pred_ids = logits.argmax(axis=-1)

    pred_tags = []
    gold_tags = []
    for pred_row, gold_row in zip(pred_ids, gold):
        # Keep only positions with a real label (-100 marks ignored tokens).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        pred_tags.append([label_names[p] for p, _ in kept])
        gold_tags.append([label_names[g] for _, g in kept])

    return seqeval.compute(predictions=pred_tags, references=gold_tags)

4. μ§ˆμ˜μ‘λ‹΅ (QA) νŒŒμΈνŠœλ‹

SQuAD 데이터

# SQuAD v1.1: extractive QA — each record pairs a context passage with a
# question and the answer's character offset inside the context.
dataset = load_dataset("squad")

print(dataset['train'][0])
# {'id': '...', 'title': 'University_of_Notre_Dame',
#  'context': 'Architecturally, the school has...',
#  'question': 'To whom did the Virgin Mary appear in 1858?',
#  'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

QA μ „μ²˜λ¦¬

def prepare_train_features(examples):
    """Tokenize question/context pairs and compute answer token positions.

    Long contexts are split into overlapping windows (stride 128); each
    resulting feature is mapped back to its source example through
    ``overflow_to_sample_mapping``. Unanswerable examples — and features
    whose window does not contain the answer — are labeled with the [CLS]
    token position.
    """
    tokenized = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",  # truncate only the context, never the question
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    tokenized["start_positions"] = []
    tokenized["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Map this feature back to its source example.
        sample_idx = sample_mapping[i]
        answers = examples["answers"][sample_idx]

        if len(answers["answer_start"]) == 0:
            # No answer: point both positions at [CLS].
            tokenized["start_positions"].append(cls_index)
            tokenized["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Locate the tokens covering the answer's character span.
            # NOTE(review): question tokens' offsets are relative to the
            # question string and could overlap these char ranges; the
            # canonical HF recipe narrows the scan with sequence_ids — confirm.
            token_start = None
            token_end = None
            for idx, (start, end) in enumerate(offsets):
                if start <= start_char < end:
                    token_start = idx
                if start < end_char <= end:
                    token_end = idx
                    break

            if token_start is None or token_end is None:
                # Fix: the original defaulted both positions to 0, silently
                # mislabeling features whose window truncated the answer out.
                # Treat those features as unanswerable instead.
                tokenized["start_positions"].append(cls_index)
                tokenized["end_positions"].append(cls_index)
            else:
                tokenized["start_positions"].append(token_start)
                tokenized["end_positions"].append(token_end)

    return tokenized

QA λͺ¨λΈ

from transformers import AutoModelForQuestionAnswering

# QA head: predicts the answer span over context tokens.
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Outputs: start_logits, end_logits (one score per token for each boundary)

5. 효율적인 νŒŒμΈνŠœλ‹ (PEFT)

LoRA (Low-Rank Adaptation)

from peft import LoraConfig, get_peft_model, TaskType

# LoRA configuration
lora_config = LoraConfig(
    r=8,                      # adapter rank
    lora_alpha=32,            # scaling factor
    target_modules=["query", "value"],  # modules to attach adapters to
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Wrap the base model with LoRA adapters; base weights stay frozen.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = get_peft_model(model, lora_config)

# Show how few parameters are actually trainable.
model.print_trainable_parameters()
# trainable params: 294,912 || all params: 109,482,240 || trainable%: 0.27%

QLoRA (Quantized LoRA)

# Fix: AutoModelForCausalLM was used below without being imported.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit quantization settings (NF4 with double quantization; compute in bf16).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model in 4-bit precision, sharded automatically across devices.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

# Attach LoRA adapters (lora_config as defined in the LoRA section).
model = get_peft_model(model, lora_config)

Prompt Tuning

from peft import PromptTuningConfig, get_peft_model

# Learn 8 virtual prompt tokens, initialized from the embedding of the
# given text; all base-model weights stay frozen.
config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=8,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Classify the sentiment: "
)

model = get_peft_model(model, config)

6. λŒ€ν™”ν˜• λͺ¨λΈ νŒŒμΈνŠœλ‹

Instruction Tuning 데이터 ν˜•μ‹

# Alpaca format (one JSON record per example)
{
    "instruction": "Summarize the following text.",
    "input": "Long article text here...",
    "output": "Summary of the article."
}

# ChatML format (role-tagged conversation as plain text)
"""
<|system|>
You are a helpful assistant.
<|user|>
What is the capital of France?
<|assistant|>
The capital of France is Paris.
"""

SFT (Supervised Fine-Tuning)

from trl import SFTTrainer

# Supervised fine-tuning on instruction/response text.
# NOTE(review): recent trl versions moved dataset_text_field / max_seq_length
# into SFTConfig — confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="./sft_output",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch = 4 * 4 = 16
        learning_rate=2e-5,
    ),
)

trainer.train()

DPO (Direct Preference Optimization)

from trl import DPOTrainer

# Preference data format:
# {'prompt': '...', 'chosen': '...', 'rejected': '...'}

trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,  # frozen reference model
    train_dataset=dataset,
    beta=0.1,
    args=TrainingArguments(...),
)

trainer.train()

7. ν•™μŠ΅ μ΅œμ ν™”

Gradient Checkpointing

# Recompute activations in the backward pass instead of storing them.
model.gradient_checkpointing_enable()

Mixed Precision

args = TrainingArguments(
    ...,
    fp16=True,  # or bf16=True
)

Gradient Accumulation

args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # effective batch size = 4 * 8 = 32
)

DeepSpeed

args = TrainingArguments(
    ...,
    deepspeed="ds_config.json"
)

# Contents of ds_config.json (JSON shown inline for reference, not Python):
{
    "fp16": {"enabled": true},
    "zero_optimization": {"stage": 2}
}

8. 전체 νŒŒμΈνŠœλ‹ 예제

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import evaluate

# 1. Data: IMDB sentiment dataset, tokenized to fixed-length inputs.
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize a batch of reviews, truncating/padding to 256 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)

tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# 2. Model + LoRA: frozen BERT base with low-rank adapters on the
#    attention query/value projections.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 3. Training configuration: note the larger learning rate (1e-4) —
#    only the small adapter weights are being trained.
args = TrainingArguments(
    output_dir="./lora_imdb",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU exists
)

# 4. Metric: plain accuracy over argmax predictions.
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute accuracy from the trainer's EvalPrediction."""
    predictions = eval_pred.predictions.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

# 5. Train
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    compute_metrics=compute_metrics,
)

trainer.train()

# 6. Save: persists only the adapter weights, not the full base model.
model.save_pretrained("./lora_imdb_final")

정리

νŒŒμΈνŠœλ‹ 선택 κ°€μ΄λ“œ

| 상황 | μΆ”μ²œ 방법 |
|------|-----------|
| μΆ©λΆ„ν•œ 데이터 + GPU | Full Fine-tuning |
| μ œν•œλœ GPU λ©”λͺ¨λ¦¬ | LoRA / QLoRA |
| 맀우 적은 데이터 | Prompt Tuning |
| LLM μ •λ ¬ | SFT + DPO/RLHF |

핡심 μ½”λ“œ

# LoRA: wrap any model with low-rank adapters in two lines.
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(r=8, target_modules=["query", "value"])
model = get_peft_model(model, lora_config)

# Trainer: the standard HF training loop.
trainer = Trainer(model=model, args=args, train_dataset=dataset)
trainer.train()

λ‹€μŒ 단계

08_Prompt_Engineering.mdμ—μ„œ 효과적인 ν”„λ‘¬ν”„νŠΈ μž‘μ„± 기법을 ν•™μŠ΅ν•©λ‹ˆλ‹€.

Use the navigation links to move between lessons.