16. LLM ํ๊ฐ ์งํ (Evaluation Metrics)
ํ์ต ๋ชฉํ
- ํ์คํธ ์์ฑ ํ๊ฐ ์งํ ์ดํด (BLEU, ROUGE, BERTScore)
- ์ฝ๋ ์์ฑ ํ๊ฐ (HumanEval, MBPP)
- LLM ๋ฒค์น๋งํฌ (MMLU, HellaSwag, TruthfulQA)
- ์ธ๊ฐ ํ๊ฐ์ ์๋ ํ๊ฐ
1. ํ๊ฐ์ ์ค์์ฑ
LLM ํ๊ฐ์ ์ด๋ ค์
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ LLM ํ๊ฐ์ ์ด๋ ค์ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ 1. ์ ๋ต์ด ์ฌ๋ฌ ๊ฐ: ๊ฐ์ ์ง๋ฌธ์ ๋ค์ํ ์ ๋ต ๊ฐ๋ฅ โ
โ โ
โ 2. ์ฃผ๊ด์ ํ์ง: "์ข์" ์๋ต์ ๊ธฐ์ค์ด ๋ชจํธ โ
โ โ
โ 3. ํ์คํฌ ๋ค์์ฑ: ์์ฝ, ๋ํ, ์ฝ๋, ์ถ๋ก ๋ฑ ๋ค์ โ
โ โ
โ 4. ์ง์ ์์ : ํ์ต ๋ฐ์ดํฐ ๊ธฐ์ค ์์ โ
โ โ
โ 5. ์์ ์ฑ: ์ ํด์ฑ, ํธํฅ, ํ๊ฐ ์ธก์ โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
ํ๊ฐ ์ ํ
| ํ๊ฐ ์ ํ | ์ค๋ช | ์์ |
|---|---|---|
| ์๋ ํ๊ฐ | ์๊ณ ๋ฆฌ์ฆ ๊ธฐ๋ฐ ์ ์ | BLEU, ROUGE, Perplexity |
| ๋ชจ๋ธ ๊ธฐ๋ฐ ํ๊ฐ | LLM์ด ํ๊ฐ | GPT-4 as Judge |
| ์ธ๊ฐ ํ๊ฐ | ์ฌ๋์ด ์ง์ ํ๊ฐ | A/B ํ์คํธ, ๋ฆฌ์ปคํธ ์ฒ๋ |
| ๋ฒค์น๋งํฌ | ํ์คํ๋ ํ์คํธ์ | MMLU, HumanEval |
2. ํ์คํธ ์ ์ฌ๋ ์งํ
BLEU (Bilingual Evaluation Understudy)
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import nltk
nltk.download('punkt')
def calculate_bleu(reference, candidate):
    """Compute sentence-level BLEU scores (1-, 2-, and 4-gram).

    Args:
        reference: reference sentence (whitespace-tokenized internally).
        candidate: generated sentence to score against the reference.

    Returns:
        dict with keys "bleu_1", "bleu_2", "bleu_4".
    """
    # sentence_bleu expects a *list* of reference token lists.
    refs = [reference.split()]
    hyp = candidate.split()

    # method1 smoothing avoids zero scores on short sentences.
    smooth = SmoothingFunction().method1

    weight_sets = {
        "bleu_1": (1, 0, 0, 0),
        "bleu_2": (0.5, 0.5, 0, 0),
        "bleu_4": (0.25, 0.25, 0.25, 0.25),
    }
    return {
        name: sentence_bleu(refs, hyp, weights=w, smoothing_function=smooth)
        for name, w in weight_sets.items()
    }
# Example: score a paraphrased candidate against a reference sentence.
reference = "The cat sat on the mat"
candidate = "The cat is sitting on the mat"

scores = calculate_bleu(reference, candidate)
for key in ("bleu_1", "bleu_4"):
    print(f"{key.replace('_', '-').upper()}: {scores[key]:.4f}")
ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
from rouge_score import rouge_scorer
def calculate_rouge(reference, candidate):
    """Compute ROUGE-1/2/L F1 scores between a reference and a candidate.

    Args:
        reference: gold text.
        candidate: generated text.

    Returns:
        dict with keys "rouge1_f1", "rouge2_f1", "rougeL_f1".
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    result = scorer.score(reference, candidate)
    # Keep only the F-measure of each metric.
    return {f"{m}_f1": result[m].fmeasure for m in metrics}
# Example: near-paraphrase sentence pair.
reference = "The quick brown fox jumps over the lazy dog."
candidate = "A quick brown fox jumped over a lazy dog."

scores = calculate_rouge(reference, candidate)
for key, label in (("rouge1_f1", "ROUGE-1"),
                   ("rouge2_f1", "ROUGE-2"),
                   ("rougeL_f1", "ROUGE-L")):
    print(f"{label} F1: {scores[key]:.4f}")
# ์ฝํผ์ค ๋ ๋ฒจ ํ๊ฐ
def corpus_rouge(references, candidates):
    """Average ROUGE-1/2/L F1 over a corpus of (reference, candidate) pairs.

    Args:
        references: list of reference texts.
        candidates: list of generated texts, paired by index (surplus items
            in the longer list are ignored, matching zip semantics).

    Returns:
        dict mapping 'rouge1'/'rouge2'/'rougeL' to the mean F1 score.
        Returns all zeros for an empty corpus instead of raising
        ZeroDivisionError (a bug in the original), and divides by the number
        of *paired* items rather than len(references).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    totals = {m: 0.0 for m in metrics}
    n_pairs = 0
    for ref, cand in zip(references, candidates):
        scores = scorer.score(ref, cand)
        for m in metrics:
            totals[m] += scores[m].fmeasure
        n_pairs += 1
    if n_pairs == 0:  # guard: empty corpus previously crashed
        return {m: 0.0 for m in metrics}
    return {m: total / n_pairs for m, total in totals.items()}
BERTScore
from bert_score import score
def calculate_bertscore(references, candidates, lang="en"):
    """Compute corpus-mean BERTScore (semantic similarity) statistics.

    Args:
        references: list of reference texts.
        candidates: list of generated texts.
        lang: language code forwarded to bert_score.

    Returns:
        dict with mean "precision", "recall", and "f1" over the corpus.
    """
    precision, recall, f1 = score(candidates, references, lang=lang, verbose=True)
    return {
        name: tensor.mean().item()
        for name, tensor in (("precision", precision),
                             ("recall", recall),
                             ("f1", f1))
    }
# Example: two semantically similar (but lexically different) pairs.
references = ["The cat sat on the mat.", "It is raining outside."]
candidates = ["A cat is sitting on the mat.", "The weather is rainy."]

bert_scores = calculate_bertscore(references, candidates)
print(f"BERTScore F1: {bert_scores['f1']:.4f}")
์งํ ๋น๊ต
def compare_metrics(reference, candidate):
    """Score one reference/candidate pair with BLEU-4, ROUGE-L and BERTScore.

    Returns:
        dict mapping metric name to its score, for side-by-side comparison.
    """
    results = {
        "BLEU-4": calculate_bleu(reference, candidate)["bleu_4"],
        "ROUGE-L": calculate_rouge(reference, candidate)["rougeL_f1"],
    }
    # BERTScore operates on lists; wrap the single pair.
    _, _, f1 = score([candidate], [reference], lang="en")
    results["BERTScore"] = f1.item()
    return results
# Compare a semantically-similar paraphrase against an exact copy:
# n-gram metrics punish the paraphrase, BERTScore rewards it.
ref = "Machine learning is a subset of artificial intelligence."
cand1 = "ML is part of AI."  # semantically similar, lexically different
cand2 = "Machine learning is a subset of artificial intelligence."  # exact copy

print("ํ๋ณด 1 (์๋ฏธ์ ์ ์ฌ):")
print(compare_metrics(ref, cand1))
print("\nํ๋ณด 2 (์์ ๋์ผ):")
print(compare_metrics(ref, cand2))
3. ์ธ์ด ๋ชจ๋ธ ์งํ
Perplexity (๋นํน๋)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def calculate_perplexity(model, tokenizer, text, max_length=1024):
    """Compute perplexity of `text` under a causal LM with a sliding window.

    Args:
        model: Hugging Face causal language model.
        tokenizer: matching tokenizer.
        text: input text to score.
        max_length: truncation length for tokenization, and fallback context
            window when the model config lacks `n_positions`.

    Returns:
        Perplexity as a float (inf for empty input).
    """
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # BUG FIX: the original overwrote the `max_length` parameter with the
    # model's context size, silently ignoring the caller's argument.
    context_len = getattr(model.config, "n_positions", max_length)
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    end_loc = 0
    for i in range(0, seq_len, stride):
        begin_loc = max(i + stride - context_len, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # number of tokens actually scored this window

        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        # Mask overlapping context tokens so each token is scored once.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # outputs.loss is mean NLL over scored tokens; rescale to a sum.
            nlls.append(outputs.loss * trg_len)

    if not nlls:  # guard: empty input would crash torch.stack
        return float("inf")
    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    return ppl.item()
# Example: perplexity of a short sentence under GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

text = "The quick brown fox jumps over the lazy dog."
print(f"Perplexity: {calculate_perplexity(model, tokenizer, text):.2f}")
Token-level Accuracy
def token_accuracy(predictions, targets):
    """Fraction of positions where the predicted token equals the target.

    Pairs are aligned with zip, so surplus items in the longer sequence are
    ignored; the denominator is len(targets).

    Args:
        predictions: sequence of predicted token ids.
        targets: sequence of gold token ids.

    Returns:
        Accuracy in [0, 1]; 0.0 for empty targets
        (the original raised ZeroDivisionError).
    """
    if not targets:
        return 0.0
    correct = sum(p == t for p, t in zip(predictions, targets))
    return correct / len(targets)
# Example: next-token predictions vs. gold labels (one mismatch at index 2).
predictions = [1, 2, 3, 4, 5]
targets = [1, 2, 0, 4, 5]
print(f"Token Accuracy: {token_accuracy(predictions, targets):.2%}")
4. ์ฝ๋ ์์ฑ ํ๊ฐ
HumanEval (pass@k)
import os
import subprocess
import sys
import tempfile
from typing import List
def execute_code(code: str, test_cases: List[str], timeout: int = 5) -> bool:
    """Run generated code plus assert-style tests in a subprocess sandbox.

    Args:
        code: Python source of the candidate solution.
        test_cases: lines (e.g. asserts) appended after the code.
        timeout: per-run wall-clock limit in seconds.

    Returns:
        True when the script exits with status 0 within the timeout.
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
        f.write(code + "\n")
        for test in test_cases:
            f.write(test + "\n")
        temp_path = f.name
    try:
        result = subprocess.run(
            # BUG FIX: sys.executable runs the same interpreter as this
            # process; a bare 'python' may be missing or a different version.
            [sys.executable, temp_path],
            capture_output=True,
            text=True,
            timeout=timeout
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Always remove the temp file, even on timeout or error.
        os.unlink(temp_path)
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021, Codex paper).

    Args:
        n: number of generated samples.
        c: number of samples that passed all tests.
        k: size of the hypothetical draw.

    Returns:
        Estimated probability that at least one of k drawn samples passes.
    """
    from math import comb
    if n - c < k:
        # Too few failures: every size-k draw contains a passing sample.
        return 1.0
    # 1 - P(all k drawn samples are failures)
    return 1.0 - comb(n - c, k) / comb(n, k)
# HumanEval ์คํ์ผ ํ๊ฐ
def evaluate_humaneval(model, tokenizer, problems, n_samples=10, k=(1, 10)):
    """HumanEval-style evaluation: sample completions, report pass@k.

    Args:
        model: HF causal LM used to sample completions.
        tokenizer: matching tokenizer.
        problems: dicts with "name", "prompt" and "test_cases" keys.
        n_samples: completions sampled per problem.
        k: iterable of k values for pass@k.
            (BUG FIX: default was a mutable list `[1, 10]`.)

    Returns:
        list of {"problem": name, "pass@k": score, ...} dicts.
    """
    results = []
    for problem in problems:
        prompt = problem["prompt"]
        tests = problem["test_cases"]

        # Sample n completions and count how many pass their tests.
        n_correct = 0
        for _ in range(n_samples):
            encoded = tokenizer(prompt, return_tensors="pt")
            generated = model.generate(
                **encoded, max_new_tokens=256, temperature=0.8, do_sample=True
            )
            candidate = tokenizer.decode(generated[0], skip_special_tokens=True)
            # Run the sampled code against the problem's tests.
            if execute_code(candidate, tests):
                n_correct += 1

        entry = {"problem": problem["name"]}
        for ki in k:
            entry[f"pass@{ki}"] = pass_at_k(n_samples, n_correct, ki)
        results.append(entry)
    return results
# Example HumanEval-style problem: a function stub (prompt) plus hidden
# assert-based test cases executed by execute_code.
problem = {
    "name": "add_two_numbers",
    "prompt": '''def add(a, b):
"""Return the sum of a and b."""
''',
    "test_cases": [
        "assert add(1, 2) == 3",
        "assert add(-1, 1) == 0",
        "assert add(0, 0) == 0"
    ]
}
MBPP (Mostly Basic Python Problems)
from datasets import load_dataset
def evaluate_mbpp(model, tokenizer, n_samples=1):
    """Evaluate a model on the MBPP test split by executing its completions.

    Args:
        model: HF causal LM.
        tokenizer: matching tokenizer.
        n_samples: kept for API compatibility (a single sample is used).

    Returns:
        {"accuracy": fraction of problems whose tests all pass}.
    """
    dataset = load_dataset("mbpp", split="test")
    correct = 0
    total = len(dataset)
    if total == 0:  # guard: avoid ZeroDivisionError on an empty split
        return {"accuracy": 0.0}
    for example in dataset:
        prompt = f"""Write a Python function that {example['text']}
{example['code'].split('def')[0]}def"""
        # Generate one completion (low temperature -> near-greedy decoding).
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.2)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # SECURITY: exec() runs model output in-process with full privileges;
        # prefer a subprocess sandbox (see execute_code) for untrusted models.
        try:
            full_code = generated + "\n" + "\n".join(example['test_list'])
            exec(full_code)
            correct += 1
        except Exception:  # BUG FIX: bare `except:` also swallowed KeyboardInterrupt
            pass
    return {"accuracy": correct / total}
5. LLM ๋ฒค์น๋งํฌ
MMLU (Massive Multitask Language Understanding)
from datasets import load_dataset
def evaluate_mmlu(model, tokenizer, subjects=None):
    """Evaluate on MMLU multiple-choice questions via single-token generation.

    Args:
        model: HF causal LM.
        tokenizer: matching tokenizer.
        subjects: optional list of subject names to restrict the test set to.

    Returns:
        (overall results dict with "correct"/"total"/"accuracy",
         per-subject dict with the same keys).
    """
    dataset = load_dataset("cais/mmlu", "all", split="test")
    if subjects:
        dataset = dataset.filter(lambda x: x["subject"] in subjects)

    results = {"correct": 0, "total": 0}
    subject_results = {}

    for example in dataset:
        question = example["question"]
        choices = example["choices"]
        answer = example["answer"]  # index 0-3 mapping to A-D
        subject = example["subject"]

        prompt = f"""Question: {question}
A. {choices[0]}
B. {choices[1]}
C. {choices[2]}
D. {choices[3]}
Answer:"""

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=1, temperature=0)
        # BUG FIX: decode only the newly generated token. The original decoded
        # the full sequence (prompt included), so the comparison against a
        # single letter "A"-"D" could never succeed and accuracy was always 0.
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)

        predicted = response.strip().upper()
        correct_answer = ["A", "B", "C", "D"][answer]
        is_correct = predicted == correct_answer

        results["total"] += 1
        if is_correct:
            results["correct"] += 1

        # Per-subject tally.
        bucket = subject_results.setdefault(subject, {"correct": 0, "total": 0})
        bucket["total"] += 1
        if is_correct:
            bucket["correct"] += 1

    # Guard against an empty (fully filtered) dataset.
    results["accuracy"] = results["correct"] / results["total"] if results["total"] else 0.0
    for stats in subject_results.values():
        stats["accuracy"] = stats["correct"] / stats["total"]

    return results, subject_results
TruthfulQA
from datasets import load_dataset
def evaluate_truthfulqa(model, tokenizer):
    """Evaluate truthfulness on the TruthfulQA generation split.

    Uses simple substring heuristics; production evaluations typically use a
    fine-tuned GPT judge instead.

    Returns:
        dict with "truthful", "informative" and "combined" rates.
    """
    dataset = load_dataset("truthful_qa", "generation", split="validation")

    records = []
    for example in dataset:
        question = example["question"]
        correct_answers = example["correct_answers"]

        # Generate an answer deterministically.
        prompt = f"Question: {question}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100, temperature=0)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer_text = decoded.replace(prompt, "").strip()

        records.append({
            "question": question,
            "response": answer_text,
            # Heuristic: truthful if any reference answer appears verbatim.
            "truthful": any(ref.lower() in answer_text.lower()
                            for ref in correct_answers),
            # Heuristic: informative if non-trivial and not a refusal.
            "informative": len(answer_text) > 10 and "I don't know" not in answer_text,
        })

    truthful_rate = sum(r["truthful"] for r in records) / len(records)
    informative_rate = sum(r["informative"] for r in records) / len(records)
    return {
        "truthful": truthful_rate,
        "informative": informative_rate,
        "combined": truthful_rate * informative_rate
    }
HellaSwag (์์ ์ถ๋ก )
from datasets import load_dataset
def evaluate_hellaswag(model, tokenizer):
    """Evaluate commonsense continuation choice on HellaSwag validation.

    Scores each candidate ending by its negated LM loss and predicts the
    highest-scoring (most probable) one.

    Returns:
        {"accuracy": fraction of examples where the top ending is the gold label}.
    """
    dataset = load_dataset("hellaswag", split="validation")

    n_correct = 0
    for example in dataset:
        context = example["ctx"]
        gold = int(example["label"])

        # Lower loss means the ending is more probable under the model.
        best_idx = -1
        best_score = float("-inf")
        for idx, ending in enumerate(example["endings"]):
            encoded = tokenizer(context + " " + ending,
                                return_tensors="pt").to(model.device)
            with torch.no_grad():
                out = model(**encoded, labels=encoded.input_ids)
            candidate_score = -out.loss.item()
            if candidate_score > best_score:
                best_idx, best_score = idx, candidate_score

        if best_idx == gold:
            n_correct += 1

    return {"accuracy": n_correct / len(dataset)}
6. LLM-as-Judge ํ๊ฐ
GPT-4 ํ๊ฐ์
from openai import OpenAI
client = OpenAI()
def llm_judge(question, response_a, response_b):
    """Ask GPT-4 (as a judge) to compare two AI responses and pick a winner.

    The Korean prompt asks for accuracy, helpfulness, clarity and
    completeness, and for a verdict line that pairwise_comparison parses
    by substring match ("์น์: A" / "์น์: B").

    NOTE(review): the prompt text below is kept byte-for-byte, including its
    original line breaks — downstream parsing depends on its exact wording.
    """
    judge_prompt = f"""๋ AI ์๋ต์ ๋น๊ตํ์ฌ ๋ ๋์ ๊ฒ์ ์ ํํ์ธ์.
์ง๋ฌธ: {question}
์๋ต A:
{response_a}
์๋ต B:
{response_b}
ํ๊ฐ ๊ธฐ์ค:
1. ์ ํ์ฑ: ์ ๋ณด๊ฐ ์ ํํ๊ฐ?
2. ์ ์ฉ์ฑ: ์ง๋ฌธ์ ์ ์ ํ ๋ต๋ณํ๋๊ฐ?
3. ๋ช
ํ์ฑ: ์ดํดํ๊ธฐ ์ฌ์ด๊ฐ?
4. ์์ ์ฑ: ์ถฉ๋ถํ ์์ธํ๊ฐ?
๋ถ์ ํ ๋ค์ ํ์์ผ๋ก ๋ตํ์ธ์:
๋ถ์: [๊ฐ ๊ธฐ์ค๋ณ ๋น๊ต]
์น์: [A ๋๋ B ๋๋ ๋์ ]
"""
    # temperature=0 keeps the judgment deterministic across runs.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0
    )
    return response.choices[0].message.content
def pairwise_comparison(questions, model_a_responses, model_b_responses):
    """Tally LLM-judge verdicts over paired responses from two models.

    Args:
        questions: evaluation questions.
        model_a_responses / model_b_responses: responses paired by index.

    Returns:
        dict with win rates for each model and the tie rate.
    """
    tally = {"A_wins": 0, "B_wins": 0, "ties": 0}
    for question, resp_a, resp_b in zip(questions, model_a_responses, model_b_responses):
        verdict = llm_judge(question, resp_a, resp_b)
        # The judge is instructed to emit a winner line; match on it.
        if "์น์: A" in verdict:
            tally["A_wins"] += 1
        elif "์น์: B" in verdict:
            tally["B_wins"] += 1
        else:
            tally["ties"] += 1

    total = len(questions)
    return {
        "model_a_win_rate": tally["A_wins"] / total,
        "model_b_win_rate": tally["B_wins"] / total,
        "tie_rate": tally["ties"] / total
    }
๋ค์ฐจ์ ํ๊ฐ
def multidim_evaluation(question, response):
    """Score one response on several 1-5 dimensions using GPT-4 as a judge.

    Args:
        question: the user question.
        response: the AI response to evaluate.

    Returns:
        dict parsed from the judge's JSON output
        (relevance, accuracy, helpfulness, coherence, safety, overall,
        explanation).
    """
    import json  # hoisted from below the API call for clarity

    eval_prompt = f"""๋ค์ AI ์๋ต์ ์ฌ๋ฌ ์ฐจ์์์ 1-5์ ์ผ๋ก ํ๊ฐํ์ธ์.
์ง๋ฌธ: {question}
์๋ต: {response}
๋ค์ ํ์์ผ๋ก JSON ์ถ๋ ฅ:
{{
"relevance": <1-5>,
"accuracy": <1-5>,
"helpfulness": <1-5>,
"coherence": <1-5>,
"safety": <1-5>,
"overall": <1-5>,
"explanation": "<ํ๊ฐ ์ด์ >"
}}
"""
    # BUG FIX: the original rebound the `response` *parameter* to the API
    # result; use a distinct name so the argument is not shadowed.
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": eval_prompt}],
        temperature=0,
        response_format={"type": "json_object"}
    )
    return json.loads(completion.choices[0].message.content)
# Example: multidimensional scoring of one Q/A pair.
# BUG FIX: the sample-answer string literal was split mid-word across lines
# (an extraction artifact that made it a syntax error); rejoined onto one line.
scores = multidim_evaluation(
    "์ธ๊ณต์ง๋ฅ์ด๋ ๋ฌด์์ธ๊ฐ์?",
    "์ธ๊ณต์ง๋ฅ(AI)์ ์ปดํจํฐ ์์คํ์ด ์ธ๊ฐ์ ์ง๋ฅ์ ๋ชจ๋ฐฉํ๋ ๊ธฐ์ ์๋๋ค..."
)
print(scores)
7. ์ธ๊ฐ ํ๊ฐ
ํ๊ฐ ์ธํฐํ์ด์ค
import gradio as gr
def human_evaluation_interface():
    """Build a Gradio UI for human scoring of LLM responses.

    Collects 1-5 slider ratings (relevance, quality, safety) plus free-form
    feedback and echoes the assembled record back to the annotator.

    Returns:
        The constructed gr.Blocks demo (call .launch() to serve it).
    """
    def submit_evaluation(question, response, relevance, quality, safety, feedback):
        # Assemble one evaluation record from the form inputs.
        result = {
            "question": question,
            "response": response,
            "scores": {
                "relevance": relevance,
                "quality": quality,
                "safety": safety
            },
            "feedback": feedback
        }
        # Placeholder: persist to a database, file, etc.
        return f"ํ๊ฐ ์ ์ฅ๋จ: {result}"

    with gr.Blocks() as demo:
        gr.Markdown("# LLM ์๋ต ํ๊ฐ")
        with gr.Row():
            question = gr.Textbox(label="์ง๋ฌธ")
            response = gr.Textbox(label="AI ์๋ต", lines=5)
        with gr.Row():
            relevance = gr.Slider(1, 5, step=1, label="๊ด๋ จ์ฑ")
            quality = gr.Slider(1, 5, step=1, label="ํ์ง")
            safety = gr.Slider(1, 5, step=1, label="์์ ์ฑ")
        feedback = gr.Textbox(label="์ถ๊ฐ ํผ๋๋ฐฑ", lines=3)
        submit_btn = gr.Button("์ ์ถ")
        result = gr.Textbox(label="๊ฒฐ๊ณผ")
        # Wire the submit button to the handler: form fields in, summary out.
        submit_btn.click(
            submit_evaluation,
            inputs=[question, response, relevance, quality, safety, feedback],
            outputs=[result]
        )
    return demo
A/B ํ์คํธ
import random
from dataclasses import dataclass
from typing import Optional
@dataclass
class ABTestResult:
    """One human judgment from a blind A/B comparison of two model responses."""
    question: str
    response_a: str
    response_b: str
    chosen: str  # "A" or "B"
    evaluator_id: str
    reason: Optional[str] = None  # optional free-text justification
class ABTestManager:
    """Collects and analyzes blind A/B test judgments between two models."""

    def __init__(self):
        # Recorded ABTestResult entries, in submission order.
        self.results = []

    def get_pair(self, question, model_a, model_b, tokenizer):
        """Generate both models' responses and return them in random order.

        Returns:
            (first_text, second_text, first_label, second_label) where the
            labels identify which underlying model produced each slot.
        """
        encoded = tokenizer(question, return_tensors="pt")
        text_a = tokenizer.decode(model_a.generate(**encoded)[0])
        text_b = tokenizer.decode(model_b.generate(**encoded)[0])
        # Shuffle presentation order so the evaluator stays blind.
        if random.random() > 0.5:
            return text_a, text_b, "A", "B"
        return text_b, text_a, "B", "A"

    def record_result(self, result: ABTestResult):
        """Store one completed judgment."""
        self.results.append(result)

    def analyze(self):
        """Summarize win rates; all rates are 0 when nothing was recorded."""
        total = len(self.results)
        wins = {"A": 0, "B": 0}
        for entry in self.results:
            if entry.chosen in wins:
                wins[entry.chosen] += 1
        return {
            "model_a_win_rate": wins["A"] / total if total > 0 else 0,
            "model_b_win_rate": wins["B"] / total if total > 0 else 0,
            "total_evaluations": total
        }
8. ํตํฉ ํ๊ฐ ํ๋ ์์ํฌ
lm-evaluation-harness
# Install EleutherAI's lm-evaluation-harness CLI
pip install lm-eval
# Run standard benchmarks (MMLU, HellaSwag, TruthfulQA) against a
# Hugging Face model checkpoint
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu,hellaswag,truthfulqa \
--batch_size 8
์ปค์คํ ํ๊ฐ ํด๋์ค
class LLMEvaluator:
    """Unified evaluation runner bundling several metric families."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # metric name -> result dict; populated by evaluate_all.
        self.results = {}

    def evaluate_all(self, test_data):
        """Run every configured evaluation and cache the results.

        Args:
            test_data: dict with "texts", "summaries" and optionally
                "mmlu_samples" / "code_problems".

        Returns:
            The cached results dict.
        """
        self.results = {
            "perplexity": self._eval_perplexity(test_data["texts"]),
            "rouge": self._eval_rouge(test_data["summaries"]),
            "mmlu": self._eval_mmlu(test_data.get("mmlu_samples", [])),
            "pass_at_k": self._eval_code(test_data.get("code_problems", [])),
        }
        return self.results

    def _eval_perplexity(self, texts):
        # Mean perplexity over the given texts.
        values = [calculate_perplexity(self.model, self.tokenizer, t) for t in texts]
        return {"mean": sum(values) / len(values), "values": values}

    def _eval_rouge(self, summaries):
        # Corpus-mean ROUGE-1 / ROUGE-L F1.
        scored = [calculate_rouge(item["reference"], item["candidate"])
                  for item in summaries]
        return {
            "rouge1": sum(s["rouge1_f1"] for s in scored) / len(scored),
            "rougeL": sum(s["rougeL_f1"] for s in scored) / len(scored),
        }

    def _eval_mmlu(self, samples):
        # MMLU evaluation logic (not implemented in this snippet).
        pass

    def _eval_code(self, problems):
        # Code-generation evaluation logic (not implemented in this snippet).
        pass

    def generate_report(self):
        """Render cached results as a Markdown report string."""
        parts = ["# LLM Evaluation Report\n\n"]
        for metric, values in self.results.items():
            parts.append(f"## {metric.upper()}\n")
            if isinstance(values, dict):
                parts.extend(
                    f"- {key}: {val:.4f}\n"
                    for key, val in values.items()
                    if isinstance(val, float)
                )
            parts.append("\n")
        return "".join(parts)
์ ๋ฆฌ
ํ๊ฐ ์งํ ์ ํ ๊ฐ์ด๋
| ํ์คํฌ | ์ถ์ฒ ์งํ |
|---|---|
| ๋ฒ์ญ | BLEU, COMET |
| ์์ฝ | ROUGE, BERTScore |
| ๋ํ | Human Eval, LLM-as-Judge |
| QA | Exact Match, F1 |
| ์ฝ๋ ์์ฑ | pass@k, MBPP |
| ์ผ๋ฐ ๋ฅ๋ ฅ | MMLU, HellaSwag |
| ์ง์ค์ฑ | TruthfulQA |
ํต์ฌ ์ฝ๋
# Cheat-sheet of the chapter's core calls. NOTE: this is a fragment — it
# assumes reference/candidate(s), n, c, k, question, response_a/b are already
# defined above; it is not runnable standalone.
# ROUGE
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
scores = scorer.score(reference, candidate)
# BERTScore
from bert_score import score
P, R, F1 = score(candidates, references, lang="en")
# pass@k
from math import comb
pass_k = 1.0 - comb(n - c, k) / comb(n, k)
# LLM-as-Judge
judgment = llm_judge(question, response_a, response_b)
ํ๊ฐ ์ฒดํฌ๋ฆฌ์คํธ
โก ํ์คํฌ์ ๋ง๋ ์งํ ์ ํ
โก ๋ค์ํ ํ๊ฐ ๋ฐฉ๋ฒ ์กฐํฉ (์๋ + ์ธ๊ฐ)
โก ์ถฉ๋ถํ ํ์คํธ ์ํ ํ๋ณด
โก ํ๊ฐ์ ๊ฐ ์ผ์น๋ ํ์ธ
โก ๊ฒฐ๊ณผ ์ ๋ขฐ๊ตฌ๊ฐ ๊ณ์ฐ
โก ์ฌํ ๊ฐ๋ฅํ ํ๊ฐ ํ๊ฒฝ
ํ์ต ์๋ฃ
์ด๊ฒ์ผ๋ก LLM & NLP ์ฌํ ํ์ต์ ์๋ฃํ์ต๋๋ค!
์ ์ฒด ํ์ต ์์ฝ
- NLP ๊ธฐ์ด (01-03): ํ ํฐํ, ์๋ฒ ๋ฉ, Transformer
- ์ฌ์ ํ์ต ๋ชจ๋ธ (04-07): BERT, GPT, HuggingFace, ํ์ธํ๋
- LLM ํ์ฉ (08-12): ํ๋กฌํํธ, RAG, LangChain, ๋ฒกํฐ DB, ์ฑ๋ด
- LLM ์ฌํ (13-16): ์์ํ, RLHF, ์์ด์ ํธ, ํ๊ฐ
๋ค์ ๋จ๊ณ ์ถ์ฒ
- ์ค์ ํ๋ก์ ํธ์ ์ ์ฉ
- Kaggle NLP ๋ํ ์ฐธ๊ฐ
- ์ต์ LLM ๋ผ๋ฌธ ์ฝ๊ธฐ (Claude, Gemini, Llama)
- ์คํ์์ค LLM ๊ธฐ์ฌ