16. LLM Evaluation Metrics

Learning Objectives

  • Understand text-generation evaluation metrics (BLEU, ROUGE, BERTScore)
  • Evaluate code generation (HumanEval, MBPP)
  • Run LLM benchmarks (MMLU, HellaSwag, TruthfulQA)
  • Combine human and automatic evaluation

1. Why Evaluation Matters

Why Evaluating LLMs Is Hard

┌────────────────────────────────────────────────────────────────┐
│                  Why Evaluating LLMs Is Hard                   │
├────────────────────────────────────────────────────────────────┤
│                                                                │
│  1. Multiple valid answers for the same question               │
│                                                                │
│  2. Subjective quality: the bar for a "good" response is fuzzy │
│                                                                │
│  3. Task diversity: summarization, dialogue, code, reasoning   │
│                                                                │
│  4. Knowledge cutoff: frozen at the training data's date       │
│                                                                │
│  5. Safety: measuring harmfulness, bias, and hallucination     │
│                                                                │
└────────────────────────────────────────────────────────────────┘

Evaluation Types

| Evaluation type | Description | Examples |
|---|---|---|
| Automatic | Algorithm-based scores | BLEU, ROUGE, Perplexity |
| Model-based | An LLM judges the output | GPT-4 as Judge |
| Human | People rate responses directly | A/B tests, Likert scales |
| Benchmark | Standardized test sets | MMLU, HumanEval |

2. Text Similarity Metrics

BLEU (Bilingual Evaluation Understudy)

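BLEU combines clipped ("modified") n-gram precisions with a brevity penalty that punishes candidates shorter than the reference: BLEU = BP * exp(sum_n w_n * log p_n). To make the clipping idea concrete, here is a hand-rolled 1-gram precision (illustration only; the helper below is not part of NLTK, which is used for the real computation that follows):

from collections import Counter

def unigram_precision(reference: str, candidate: str) -> float:
    """Clipped 1-gram precision: candidate tokens also found in the reference,
    counted at most as often as they occur in the reference."""
    ref_counts = Counter(reference.split())
    cand_counts = Counter(candidate.split())
    clipped = sum(min(count, ref_counts[tok]) for tok, count in cand_counts.items())
    return clipped / max(sum(cand_counts.values()), 1)

print(unigram_precision("The cat sat on the mat", "The cat is sitting on the mat"))
# 5 of the 7 candidate tokens appear in the reference -> ~0.71
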
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(reference, candidate):
    """Compute BLEU scores for a single sentence pair."""
    # Tokenize (whitespace split is enough for this example)
    reference_tokens = [reference.split()]  # references must be wrapped in a list
    candidate_tokens = candidate.split()

    # Smoothing (handles short sentences with zero n-gram matches)
    smoothie = SmoothingFunction().method1

    # BLEU scores from 1-gram up to 4-gram
    bleu_1 = sentence_bleu(reference_tokens, candidate_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu_2 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu_4 = sentence_bleu(reference_tokens, candidate_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    return {
        "bleu_1": bleu_1,
        "bleu_2": bleu_2,
        "bleu_4": bleu_4
    }

# Usage
reference = "The cat sat on the mat"
candidate = "The cat is sitting on the mat"
scores = calculate_bleu(reference, candidate)
print(f"BLEU-1: {scores['bleu_1']:.4f}")
print(f"BLEU-4: {scores['bleu_4']:.4f}")

ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

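ROUGE is recall-oriented: ROUGE-N counts n-gram overlap against the reference, while ROUGE-L uses the longest common subsequence (LCS), so it rewards in-order matches without requiring them to be contiguous. The scorer below reports the F1 variant of each:

    ROUGE-N (recall) = (# overlapping n-grams) / (# n-grams in the reference)
    ROUGE-L (recall) = LCS(reference, candidate) / (# tokens in the reference)
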
from rouge_score import rouge_scorer

def calculate_rouge(reference, candidate):
    """ROUGE ์ ์ˆ˜ ๊ณ„์‚ฐ"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)

    return {
        "rouge1_f1": scores['rouge1'].fmeasure,
        "rouge2_f1": scores['rouge2'].fmeasure,
        "rougeL_f1": scores['rougeL'].fmeasure,
    }

# Usage
reference = "The quick brown fox jumps over the lazy dog."
candidate = "A quick brown fox jumped over a lazy dog."
scores = calculate_rouge(reference, candidate)
print(f"ROUGE-1 F1: {scores['rouge1_f1']:.4f}")
print(f"ROUGE-2 F1: {scores['rouge2_f1']:.4f}")
print(f"ROUGE-L F1: {scores['rougeL_f1']:.4f}")

# ์ฝ”ํผ์Šค ๋ ˆ๋ฒจ ํ‰๊ฐ€
def corpus_rouge(references, candidates):
    """์ฝ”ํผ์Šค ์ „์ฒด ROUGE"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    totals = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    for ref, cand in zip(references, candidates):
        scores = scorer.score(ref, cand)
        totals['rouge1'] += scores['rouge1'].fmeasure
        totals['rouge2'] += scores['rouge2'].fmeasure
        totals['rougeL'] += scores['rougeL'].fmeasure

    n = len(references)
    return {k: v/n for k, v in totals.items()}

BERTScore

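Unlike BLEU and ROUGE, BERTScore compares contextual token embeddings rather than surface n-grams: each token is greedily matched to the most similar token on the other side by cosine similarity, so paraphrases can still score well.

    Recall    = mean over reference tokens of the max cosine similarity to any candidate token
    Precision = mean over candidate tokens of the max cosine similarity to any reference token
    F1        = 2 * P * R / (P + R)
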
from bert_score import score

def calculate_bertscore(references, candidates, lang="en"):
    """BERTScore ๊ณ„์‚ฐ (์˜๋ฏธ์  ์œ ์‚ฌ๋„)"""
    P, R, F1 = score(candidates, references, lang=lang, verbose=True)

    return {
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }

# Usage
references = ["The cat sat on the mat.", "It is raining outside."]
candidates = ["A cat is sitting on the mat.", "The weather is rainy."]

bert_scores = calculate_bertscore(references, candidates)
print(f"BERTScore F1: {bert_scores['f1']:.4f}")

Comparing the Metrics

def compare_metrics(reference, candidate):
    """Compare several metrics on the same reference/candidate pair."""
    results = {}

    # BLEU
    bleu = calculate_bleu(reference, candidate)
    results["BLEU-4"] = bleu["bleu_4"]

    # ROUGE
    rouge = calculate_rouge(reference, candidate)
    results["ROUGE-L"] = rouge["rougeL_f1"]

    # BERTScore
    P, R, F1 = score([candidate], [reference], lang="en")
    results["BERTScore"] = F1.item()

    return results

# Compare
ref = "Machine learning is a subset of artificial intelligence."
cand1 = "ML is part of AI."  # semantically similar
cand2 = "Machine learning is a subset of artificial intelligence."  # identical

print("Candidate 1 (semantically similar):")
print(compare_metrics(ref, cand1))

print("\nCandidate 2 (identical):")
print(compare_metrics(ref, cand2))

3. Language Modeling Metrics

Perplexity

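Perplexity is the exponentiated average negative log-likelihood per token: PPL = exp(-(1/N) * sum_i log p(x_i | x_<i)); lower means the model finds the text more predictable. For a short text that fits in one context window, it is simply exp of the language-modeling loss. A minimal sketch of that case (the sliding-window version below handles texts longer than the model's context):

import torch

def simple_perplexity(model, tokenizer, text):
    """exp(mean NLL) over all tokens of a short text (fits in one context window)."""
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        loss = model(**inputs, labels=inputs.input_ids).loss  # mean cross-entropy
    return torch.exp(loss).item()
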
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def calculate_perplexity(model, tokenizer, text, max_length=1024):
    """ํผํ”Œ๋ ‰์‹œํ‹ฐ ๊ณ„์‚ฐ"""
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)

    max_length = model.config.n_positions if hasattr(model.config, "n_positions") else 1024
    stride = 512

    lls = []
    for i in range(0, encodings.input_ids.size(1), stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
        trg_len = end_loc - i

        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # mask the overlap so each token is scored only once

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            log_likelihood = outputs.loss * trg_len

        lls.append(log_likelihood)

    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
    return ppl.item()

# Usage
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

text = "The quick brown fox jumps over the lazy dog."
ppl = calculate_perplexity(model, tokenizer, text)
print(f"Perplexity: {ppl:.2f}")

Token-level Accuracy

def token_accuracy(predictions, targets):
    """ํ† ํฐ ๋ ˆ๋ฒจ ์ •ํ™•๋„"""
    correct = sum(p == t for p, t in zip(predictions, targets))
    return correct / len(targets)

# Example: next-token predictions
predictions = [1, 2, 3, 4, 5]
targets = [1, 2, 0, 4, 5]
acc = token_accuracy(predictions, targets)
print(f"Token Accuracy: {acc:.2%}")

4. Code Generation Evaluation

HumanEval (pass@k)

import subprocess
import tempfile
import os
from typing import List

def execute_code(code: str, test_cases: List[str], timeout: int = 5) -> bool:
    """์ฝ”๋“œ ์‹คํ–‰ ๋ฐ ํ…Œ์ŠคํŠธ"""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
        f.write(code + "\n")
        for test in test_cases:
            f.write(test + "\n")
        temp_path = f.name

    try:
        result = subprocess.run(
            ['python', temp_path],
            capture_output=True,
            text=True,
            timeout=timeout
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        os.unlink(temp_path)

def pass_at_k(n: int, c: int, k: int) -> float:
    """
    pass@k ๊ณ„์‚ฐ
    n: ์ƒ์„ฑ๋œ ์ƒ˜ํ”Œ ์ˆ˜
    c: ์ •๋‹ต ์ƒ˜ํ”Œ ์ˆ˜
    k: k๊ฐ’
    """
    if n - c < k:
        return 1.0

    from math import comb
    return 1.0 - comb(n - c, k) / comb(n, k)
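
# Quick sanity check with hypothetical numbers: 10 samples generated, 3 pass all tests.
# pass@1 = 1 - C(7,1)/C(10,1) = 1 - 7/10   = 0.30
# pass@5 = 1 - C(7,5)/C(10,5) = 1 - 21/252 ~= 0.9167
print(pass_at_k(n=10, c=3, k=1))   # 0.3
print(pass_at_k(n=10, c=3, k=5))   # ~0.9167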

# HumanEval-style evaluation
def evaluate_humaneval(model, tokenizer, problems, n_samples=10, k=[1, 10]):
    """HumanEval-style pass@k evaluation."""
    results = []

    for problem in problems:
        prompt = problem["prompt"]
        test_cases = problem["test_cases"]

        # Generate n samples
        correct = 0
        for _ in range(n_samples):
            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.8, do_sample=True)
            code = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Run the tests
            if execute_code(code, test_cases):
                correct += 1

        # Compute pass@k
        pass_rates = {f"pass@{ki}": pass_at_k(n_samples, correct, ki) for ki in k}
        results.append({"problem": problem["name"], **pass_rates})

    return results

# Example problem
problem = {
    "name": "add_two_numbers",
    "prompt": '''def add(a, b):
    """Return the sum of a and b."""
''',
    "test_cases": [
        "assert add(1, 2) == 3",
        "assert add(-1, 1) == 0",
        "assert add(0, 0) == 0"
    ]
}

MBPP (Mostly Basic Python Problems)

from datasets import load_dataset

def evaluate_mbpp(model, tokenizer, n_samples=1):
    """MBPP ๋ฒค์น˜๋งˆํฌ ํ‰๊ฐ€"""
    dataset = load_dataset("mbpp", split="test")

    correct = 0
    total = len(dataset)

    for example in dataset:
        prompt = f"""Write a Python function that {example['text']}

{example['code'].split('def')[0]}def"""

        # Generate code
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.2)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Run the generated code against the provided tests
        # (exec() on untrusted output is unsafe; use a sandbox or subprocess in practice)
        try:
            full_code = generated + "\n" + "\n".join(example['test_list'])
            exec(full_code)
            correct += 1
        except Exception:
            pass

    return {"accuracy": correct / total}

5. LLM Benchmarks

MMLU (Massive Multitask Language Understanding)

from datasets import load_dataset

def evaluate_mmlu(model, tokenizer, subjects=None):
    """MMLU ๋ฒค์น˜๋งˆํฌ"""
    dataset = load_dataset("cais/mmlu", "all", split="test")

    if subjects:
        dataset = dataset.filter(lambda x: x["subject"] in subjects)

    results = {"correct": 0, "total": 0}
    subject_results = {}

    for example in dataset:
        question = example["question"]
        choices = example["choices"]
        answer = example["answer"]  # 0-3
        subject = example["subject"]

        # Build the prompt
        prompt = f"""Question: {question}

A. {choices[0]}
B. {choices[1]}
C. {choices[2]}
D. {choices[3]}

Answer:"""

        # ์ƒ์„ฑ
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=1, temperature=0)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Check the answer
        predicted = response.strip().upper()
        correct_answer = ["A", "B", "C", "D"][answer]

        is_correct = predicted == correct_answer
        results["total"] += 1
        if is_correct:
            results["correct"] += 1

        # Per-subject aggregation
        if subject not in subject_results:
            subject_results[subject] = {"correct": 0, "total": 0}
        subject_results[subject]["total"] += 1
        if is_correct:
            subject_results[subject]["correct"] += 1

    # ์ •ํ™•๋„ ๊ณ„์‚ฐ
    results["accuracy"] = results["correct"] / results["total"]
    for subject in subject_results:
        s = subject_results[subject]
        s["accuracy"] = s["correct"] / s["total"]

    return results, subject_results

# Example usage (subject names as used by the cais/mmlu config)
subjects = ["college_computer_science", "machine_learning", "high_school_mathematics"]
# results, by_subject = evaluate_mmlu(model, tokenizer, subjects)

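Parsing a generated answer letter can be brittle, especially for base models. Many harnesses instead compare the log-probability the model assigns to each answer letter and take the argmax. A minimal sketch under the same model/tokenizer assumptions, using the prompt format built above (illustrative helper, not part of any library):

import torch

def mmlu_choice_by_logprob(model, tokenizer, prompt):
    """Pick the answer letter whose token gets the highest next-token logit."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # logits for the next token
    letter_ids = [tokenizer(f" {c}", add_special_tokens=False).input_ids[-1]
                  for c in ["A", "B", "C", "D"]]
    scores = [logits[i].item() for i in letter_ids]
    return ["A", "B", "C", "D"][scores.index(max(scores))]
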
TruthfulQA

from datasets import load_dataset

def evaluate_truthfulqa(model, tokenizer):
    """TruthfulQA ํ‰๊ฐ€ (์ง„์‹ค์„ฑ)"""
    dataset = load_dataset("truthful_qa", "generation", split="validation")

    results = []

    for example in dataset:
        question = example["question"]
        best_answer = example["best_answer"]
        correct_answers = example["correct_answers"]
        incorrect_answers = example["incorrect_answers"]

        # ์ƒ์„ฑ
        prompt = f"Question: {question}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100, temperature=0)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.replace(prompt, "").strip()

        # Scoring (simplified; the official benchmark uses fine-tuned GPT judges)
        is_truthful = any(ans.lower() in response.lower() for ans in correct_answers)
        is_informative = len(response) > 10 and "I don't know" not in response

        results.append({
            "question": question,
            "response": response,
            "truthful": is_truthful,
            "informative": is_informative
        })

    truthful_rate = sum(r["truthful"] for r in results) / len(results)
    informative_rate = sum(r["informative"] for r in results) / len(results)

    return {
        "truthful": truthful_rate,
        "informative": informative_rate,
        "combined": truthful_rate * informative_rate
    }

HellaSwag (Commonsense Reasoning)

from datasets import load_dataset

def evaluate_hellaswag(model, tokenizer):
    """HellaSwag ํ‰๊ฐ€"""
    dataset = load_dataset("hellaswag", split="validation")

    correct = 0
    total = len(dataset)

    for example in dataset:
        context = example["ctx"]
        endings = example["endings"]
        label = int(example["label"])

        # ๊ฐ ์„ ํƒ์ง€์— ๋Œ€ํ•œ ํ™•๋ฅ  ๊ณ„์‚ฐ
        scores = []
        for ending in endings:
            text = context + " " + ending
            inputs = tokenizer(text, return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model(**inputs, labels=inputs.input_ids)
                scores.append(-outputs.loss.item())  # lower loss = higher likelihood

        predicted = scores.index(max(scores))
        if predicted == label:
            correct += 1

    return {"accuracy": correct / total}

6. LLM-as-Judge Evaluation

GPT-4 as a Judge

from openai import OpenAI

client = OpenAI()

def llm_judge(question, response_a, response_b):
    """LLM์„ ์‚ฌ์šฉํ•œ ์‘๋‹ต ๋น„๊ต"""
    judge_prompt = f"""๋‘ AI ์‘๋‹ต์„ ๋น„๊ตํ•˜์—ฌ ๋” ๋‚˜์€ ๊ฒƒ์„ ์„ ํƒํ•˜์„ธ์š”.

์งˆ๋ฌธ: {question}

์‘๋‹ต A:
{response_a}

์‘๋‹ต B:
{response_b}

ํ‰๊ฐ€ ๊ธฐ์ค€:
1. ์ •ํ™•์„ฑ: ์ •๋ณด๊ฐ€ ์ •ํ™•ํ•œ๊ฐ€?
2. ์œ ์šฉ์„ฑ: ์งˆ๋ฌธ์— ์ ์ ˆํžˆ ๋‹ต๋ณ€ํ–ˆ๋Š”๊ฐ€?
3. ๋ช…ํ™•์„ฑ: ์ดํ•ดํ•˜๊ธฐ ์‰ฌ์šด๊ฐ€?
4. ์™„์ „์„ฑ: ์ถฉ๋ถ„ํžˆ ์ƒ์„ธํ•œ๊ฐ€?

๋ถ„์„ ํ›„ ๋‹ค์Œ ํ˜•์‹์œผ๋กœ ๋‹ตํ•˜์„ธ์š”:
๋ถ„์„: [๊ฐ ๊ธฐ์ค€๋ณ„ ๋น„๊ต]
์Šน์ž: [A ๋˜๋Š” B ๋˜๋Š” ๋™์ ]
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0
    )

    return response.choices[0].message.content

def pairwise_comparison(questions, model_a_responses, model_b_responses):
    """์Œ๋Œ€ ๋น„๊ต ํ‰๊ฐ€"""
    results = {"A_wins": 0, "B_wins": 0, "ties": 0}

    for q, a, b in zip(questions, model_a_responses, model_b_responses):
        judgment = llm_judge(q, a, b)

        if "์Šน์ž: A" in judgment:
            results["A_wins"] += 1
        elif "์Šน์ž: B" in judgment:
            results["B_wins"] += 1
        else:
            results["ties"] += 1

    total = len(questions)
    return {
        "model_a_win_rate": results["A_wins"] / total,
        "model_b_win_rate": results["B_wins"] / total,
        "tie_rate": results["ties"] / total
    }

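LLM judges are known to show position bias (a preference for whichever response is listed first). A common mitigation, sketched below on top of llm_judge, is to judge each pair twice with the order swapped and count a win only when both verdicts agree (illustrative helper, not part of any library):

def debiased_judge(question, response_a, response_b):
    """Judge twice with the response order swapped; declare a winner only if both runs agree."""
    first = llm_judge(question, response_a, response_b)    # response_a shown as "A"
    second = llm_judge(question, response_b, response_a)   # response_a shown as "B"
    if "Winner: A" in first and "Winner: B" in second:
        return "A"
    if "Winner: B" in first and "Winner: A" in second:
        return "B"
    return "tie"
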
Multi-dimensional Evaluation

def multidim_evaluation(question, response):
    """Multi-dimensional LLM-based scoring."""
    eval_prompt = f"""Rate the following AI response on each dimension from 1 to 5.

Question: {question}

Response: {response}

Output JSON in the following format:
{{
    "relevance": <1-5>,
    "accuracy": <1-5>,
    "helpfulness": <1-5>,
    "coherence": <1-5>,
    "safety": <1-5>,
    "overall": <1-5>,
    "explanation": "<reason for the scores>"
}}
"""

    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": eval_prompt}],
        temperature=0,
        response_format={"type": "json_object"}
    )

    import json
    return json.loads(completion.choices[0].message.content)

# Usage
scores = multidim_evaluation(
    "What is artificial intelligence?",
    "Artificial intelligence (AI) is a technology that enables computer systems to mimic human intelligence..."
)
print(scores)

7. Human Evaluation

Evaluation Interface

import gradio as gr

def human_evaluation_interface():
    """์ธ๊ฐ„ ํ‰๊ฐ€์šฉ Gradio ์ธํ„ฐํŽ˜์ด์Šค"""

    def submit_evaluation(question, response, relevance, quality, safety, feedback):
        # Store the result
        result = {
            "question": question,
            "response": response,
            "scores": {
                "relevance": relevance,
                "quality": quality,
                "safety": safety
            },
            "feedback": feedback
        }
        # e.g., persist to a database
        return f"Evaluation saved: {result}"

    with gr.Blocks() as demo:
        gr.Markdown("# LLM Response Evaluation")

        with gr.Row():
            question = gr.Textbox(label="Question")
            response = gr.Textbox(label="AI response", lines=5)

        with gr.Row():
            relevance = gr.Slider(1, 5, step=1, label="Relevance")
            quality = gr.Slider(1, 5, step=1, label="Quality")
            safety = gr.Slider(1, 5, step=1, label="Safety")

        feedback = gr.Textbox(label="Additional feedback", lines=3)
        submit_btn = gr.Button("Submit")
        result = gr.Textbox(label="Result")

        submit_btn.click(
            submit_evaluation,
            inputs=[question, response, relevance, quality, safety, feedback],
            outputs=[result]
        )

    return demo

# demo = human_evaluation_interface()
# demo.launch()

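When several raters score the same items, report inter-rater agreement alongside the averages (the checklist at the end of this lesson calls this out). A minimal Cohen's kappa for two raters over nominal labels, with no external dependencies (hypothetical helper for illustration):

def cohens_kappa(labels_a, labels_b):
    """Cohen's kappa: (observed agreement - chance agreement) / (1 - chance agreement)."""
    assert len(labels_a) == len(labels_b) and labels_a
    n = len(labels_a)
    categories = set(labels_a) | set(labels_b)
    observed = sum(a == b for a, b in zip(labels_a, labels_b)) / n
    expected = sum((labels_a.count(c) / n) * (labels_b.count(c) / n) for c in categories)
    return (observed - expected) / (1 - expected) if expected < 1 else 1.0

print(cohens_kappa(["A", "B", "A", "A"], ["A", "B", "B", "A"]))  # partial agreement
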
A/B Testing

import random
from dataclasses import dataclass
from typing import Optional

@dataclass
class ABTestResult:
    question: str
    response_a: str
    response_b: str
    chosen: str  # "A" or "B"
    evaluator_id: str
    reason: Optional[str] = None

class ABTestManager:
    def __init__(self):
        self.results = []

    def get_pair(self, question, model_a, model_b, tokenizer):
        """๋ฌด์ž‘์œ„ ์ˆœ์„œ๋กœ ๋‘ ์‘๋‹ต ๋ฐ˜ํ™˜"""
        # ์‘๋‹ต ์ƒ์„ฑ
        inputs = tokenizer(question, return_tensors="pt")
        response_a = tokenizer.decode(model_a.generate(**inputs)[0])
        response_b = tokenizer.decode(model_b.generate(**inputs)[0])

        # Randomize the presentation order
        if random.random() > 0.5:
            return response_a, response_b, "A", "B"
        else:
            return response_b, response_a, "B", "A"

    def record_result(self, result: ABTestResult):
        self.results.append(result)

    def analyze(self):
        """๊ฒฐ๊ณผ ๋ถ„์„"""
        a_wins = sum(1 for r in self.results if r.chosen == "A")
        b_wins = sum(1 for r in self.results if r.chosen == "B")
        total = len(self.results)

        return {
            "model_a_win_rate": a_wins / total if total > 0 else 0,
            "model_b_win_rate": b_wins / total if total > 0 else 0,
            "total_evaluations": total
        }

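Win rates from a handful of A/B judgments are noisy, so it helps to attach an interval to the numbers returned by analyze(). A rough normal-approximation confidence interval for a win rate (a sketch; for small samples an exact binomial or bootstrap interval is preferable):

import math

def win_rate_confidence_interval(wins, total, z=1.96):
    """Approximate 95% CI for a win rate using the normal approximation."""
    if total == 0:
        return (0.0, 0.0)
    p = wins / total
    margin = z * math.sqrt(p * (1 - p) / total)
    return (max(0.0, p - margin), min(1.0, p + margin))

print(win_rate_confidence_interval(wins=60, total=100))  # roughly (0.50, 0.70)
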
8. Unified Evaluation Frameworks

lm-evaluation-harness

# Install
pip install lm-eval

# Run
lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-hf \
    --tasks mmlu,hellaswag,truthfulqa \
    --batch_size 8

Custom Evaluator Class

class LLMEvaluator:
    """ํ†ตํ•ฉ LLM ํ‰๊ฐ€๊ธฐ"""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.results = {}

    def evaluate_all(self, test_data):
        """์ „์ฒด ํ‰๊ฐ€ ์‹คํ–‰"""
        self.results = {
            "perplexity": self._eval_perplexity(test_data["texts"]),
            "rouge": self._eval_rouge(test_data["summaries"]),
            "mmlu": self._eval_mmlu(test_data.get("mmlu_samples", [])),
            "pass_at_k": self._eval_code(test_data.get("code_problems", [])),
        }
        return self.results

    def _eval_perplexity(self, texts):
        ppls = [calculate_perplexity(self.model, self.tokenizer, t) for t in texts]
        return {"mean": sum(ppls) / len(ppls), "values": ppls}

    def _eval_rouge(self, summaries):
        scores = [calculate_rouge(s["reference"], s["candidate"]) for s in summaries]
        return {
            "rouge1": sum(s["rouge1_f1"] for s in scores) / len(scores),
            "rougeL": sum(s["rougeL_f1"] for s in scores) / len(scores),
        }

    def _eval_mmlu(self, samples):
        # MMLU evaluation logic
        pass

    def _eval_code(self, problems):
        # Code generation evaluation logic
        pass

    def generate_report(self):
        """ํ‰๊ฐ€ ๋ณด๊ณ ์„œ ์ƒ์„ฑ"""
        report = "# LLM Evaluation Report\n\n"

        for metric, values in self.results.items():
            report += f"## {metric.upper()}\n"
            if isinstance(values, dict):
                for k, v in values.items():
                    if isinstance(v, float):
                        report += f"- {k}: {v:.4f}\n"
            report += "\n"

        return report

Summary

Metric Selection Guide

ํƒœ์Šคํฌ ์ถ”์ฒœ ์ง€ํ‘œ
๋ฒˆ์—ญ BLEU, COMET
์š”์•ฝ ROUGE, BERTScore
๋Œ€ํ™” Human Eval, LLM-as-Judge
QA Exact Match, F1
์ฝ”๋“œ ์ƒ์„ฑ pass@k, MBPP
์ผ๋ฐ˜ ๋Šฅ๋ ฅ MMLU, HellaSwag
์ง„์‹ค์„ฑ TruthfulQA
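
The QA row above refers to SQuAD-style Exact Match and token-level F1, which were not shown earlier; a minimal sketch of both:

import re
import string
from collections import Counter

def normalize(text):
    """Lowercase, strip punctuation and articles, collapse whitespace (SQuAD-style)."""
    text = "".join(ch for ch in text.lower() if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def exact_match(prediction, gold):
    return float(normalize(prediction) == normalize(gold))

def f1_score(prediction, gold):
    pred_tokens, gold_tokens = normalize(prediction).split(), normalize(gold).split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)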

Key Code

# ROUGE
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
scores = scorer.score(reference, candidate)

# BERTScore
from bert_score import score
P, R, F1 = score(candidates, references, lang="en")

# pass@k
from math import comb
pass_k = 1.0 - comb(n - c, k) / comb(n, k)

# LLM-as-Judge
judgment = llm_judge(question, response_a, response_b)

Evaluation Checklist

□ Choose metrics that match the task
□ Combine evaluation methods (automatic + human)
□ Collect enough test samples
□ Check inter-rater agreement
□ Report confidence intervals for the results
□ Keep the evaluation environment reproducible

Course Complete

This wraps up the advanced LLM & NLP curriculum!

Full Course Summary

  1. NLP fundamentals (01-03): tokenization, embeddings, Transformer
  2. Pretrained models (04-07): BERT, GPT, HuggingFace, fine-tuning
  3. Applying LLMs (08-12): prompting, RAG, LangChain, vector DBs, chatbots
  4. Advanced LLM topics (13-16): quantization, RLHF, agents, evaluation

Recommended Next Steps

  • Apply what you learned to a real project
  • Enter a Kaggle NLP competition
  • Read recent LLM papers (Claude, Gemini, Llama)
  • Contribute to open-source LLMs