14_rlhf_reward_model.py

  1"""
  214. RLHF와 LLM 정렬 (Alignment) 예제
  3
  4Reward Model, PPO, DPO, Constitutional AI 실습
  5"""
  6
  7import numpy as np
  8import random
  9
 10print("=" * 60)
 11print("RLHF와 LLM 정렬 (Alignment)")
 12print("=" * 60)
 13
 14
 15# ============================================
 16# 1. 선호도 데이터 이해
 17# ============================================
 18print("\n[1] 선호도 데이터 형식")
 19print("-" * 40)
 20
 21# 선호도 데이터 예시
 22preference_data = [
 23    {
 24        "prompt": "인공지능이란 무엇인가요?",
 25        "chosen": "인공지능(AI)은 컴퓨터 시스템이 인간의 지능을 모방하여 학습, 추론, "
 26                  "문제 해결 등의 작업을 수행하는 기술입니다. 머신러닝, 딥러닝, "
 27                  "자연어 처리 등 다양한 분야를 포함합니다.",
 28        "rejected": "AI는 컴퓨터가 똑똑해지는 것입니다."
 29    },
 30    {
 31        "prompt": "파이썬의 장점은?",
 32        "chosen": "파이썬의 주요 장점은 1) 읽기 쉬운 문법, 2) 풍부한 라이브러리, "
 33                  "3) 다양한 분야 적용 가능, 4) 활발한 커뮤니티입니다.",
 34        "rejected": "파이썬은 좋은 언어입니다."
 35    },
 36    {
 37        "prompt": "운동의 효과는?",
 38        "chosen": "규칙적인 운동은 심혈관 건강 개선, 체중 관리, 근력 강화, "
 39                  "정신 건강 향상, 수면 질 개선 등 다양한 효과가 있습니다.",
 40        "rejected": "운동하면 건강해집니다."
 41    }
 42]
 43
 44print("선호도 데이터 예시:")
 45for i, data in enumerate(preference_data):
 46    print(f"\n{i+1}. 프롬프트: {data['prompt']}")
 47    print(f"   선호 응답: {data['chosen'][:50]}...")
 48    print(f"   비선호 응답: {data['rejected']}")
 49
 50
 51# ============================================
 52# 2. 간단한 Reward Model 시뮬레이션
 53# ============================================
 54print("\n[2] 간단한 Reward Model")
 55print("-" * 40)
 56
 57class SimpleRewardModel:
 58    """간단한 규칙 기반 Reward Model (시뮬레이션용)"""
 59
 60    def __init__(self):
 61        self.positive_factors = {
 62            "length": 0.3,        # 적절한 길이
 63            "detail": 0.3,        # 상세함
 64            "structure": 0.2,     # 구조화
 65            "politeness": 0.2     # 정중함
 66        }
 67
 68    def compute_reward(self, prompt, response):
 69        """응답에 대한 보상 점수 계산"""
 70        score = 0.0
 71
 72        # 1. 길이 점수 (50-300자 최적)
 73        length = len(response)
 74        if 50 <= length <= 300:
 75            score += self.positive_factors["length"]
 76        elif length > 300:
 77            score += self.positive_factors["length"] * 0.5
 78
 79        # 2. 상세함 (숫자, 예시 포함)
 80        if any(c.isdigit() for c in response):
 81            score += self.positive_factors["detail"] * 0.5
 82        if "예를 들어" in response or "예시" in response:
 83            score += self.positive_factors["detail"] * 0.5
 84
 85        # 3. 구조화 (쉼표, 마침표 사용)
 86        if response.count(',') >= 2:
 87            score += self.positive_factors["structure"]
 88
 89        # 4. 정중함
 90        polite_words = ["입니다", "습니다", "됩니다"]
 91        if any(word in response for word in polite_words):
 92            score += self.positive_factors["politeness"]
 93
 94        return score
 95
 96# 테스트
 97reward_model = SimpleRewardModel()
 98
 99print("Reward Model 테스트:")
100for data in preference_data:
101    chosen_reward = reward_model.compute_reward(data["prompt"], data["chosen"])
102    rejected_reward = reward_model.compute_reward(data["prompt"], data["rejected"])
103    print(f"\n프롬프트: {data['prompt']}")
104    print(f"  선호 응답 점수: {chosen_reward:.2f}")
105    print(f"  비선호 응답 점수: {rejected_reward:.2f}")
106    print(f"  정렬 여부: {'OK' if chosen_reward > rejected_reward else 'FAIL'}")
107
108
109# ============================================
110# 3. Bradley-Terry 모델 (DPO 기반)
111# ============================================
112print("\n[3] Bradley-Terry 모델 (선호도 확률)")
113print("-" * 40)
114
115def bradley_terry_probability(reward_chosen, reward_rejected, beta=1.0):
116    """
117    Bradley-Terry 모델로 선호 확률 계산
118
119    P(chosen > rejected) = sigmoid(beta * (r_chosen - r_rejected))
120    """
121    diff = reward_chosen - reward_rejected
122    prob = 1 / (1 + np.exp(-beta * diff))
123    return prob
124
125def dpo_loss(reward_chosen, reward_rejected, beta=0.1):
126    """
127    DPO 손실 함수 (간단한 버전)
128
129    L = -log(sigmoid(beta * (r_chosen - r_rejected)))
130    """
131    prob = bradley_terry_probability(reward_chosen, reward_rejected, beta)
132    loss = -np.log(prob + 1e-10)
133    return loss
134
135# 테스트
136print("Bradley-Terry 선호 확률:")
137for r_c, r_r in [(0.8, 0.3), (0.5, 0.5), (0.3, 0.7)]:
138    prob = bradley_terry_probability(r_c, r_r, beta=2.0)
139    loss = dpo_loss(r_c, r_r, beta=2.0)
140    print(f"  r_chosen={r_c}, r_rejected={r_r} -> P(chosen)={prob:.4f}, Loss={loss:.4f}")
141
142
143# ============================================
144# 4. PPO 개념 시뮬레이션
145# ============================================
146print("\n[4] PPO 개념 시뮬레이션")
147print("-" * 40)
148
149class SimplePPOSimulator:
150    """PPO 개념 시뮬레이션"""
151
152    def __init__(self, clip_epsilon=0.2, kl_coef=0.1):
153        self.clip_epsilon = clip_epsilon
154        self.kl_coef = kl_coef
155        self.policy_history = []
156
157    def compute_ratio(self, new_prob, old_prob):
158        """확률 비율 계산"""
159        return new_prob / (old_prob + 1e-10)
160
161    def clip_ratio(self, ratio):
162        """PPO 클리핑"""
163        return np.clip(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon)
164
165    def compute_ppo_objective(self, ratio, advantage):
166        """PPO 목적 함수"""
167        clipped_ratio = self.clip_ratio(ratio)
168        obj1 = ratio * advantage
169        obj2 = clipped_ratio * advantage
170        return min(obj1, obj2)  # 보수적 업데이트
171
172    def compute_kl_penalty(self, new_prob, old_prob):
173        """KL 페널티"""
174        kl = new_prob * np.log(new_prob / (old_prob + 1e-10) + 1e-10)
175        return self.kl_coef * kl
176
177# 테스트
178ppo = SimplePPOSimulator()
179print("PPO 클리핑 예시:")
180
181test_cases = [
182    (0.8, 0.5, 1.0),   # 확률 증가, 양의 어드밴티지
183    (0.3, 0.5, 1.0),   # 확률 감소, 양의 어드밴티지
184    (0.8, 0.5, -1.0),  # 확률 증가, 음의 어드밴티지
185]
186
187for new_p, old_p, adv in test_cases:
188    ratio = ppo.compute_ratio(new_p, old_p)
189    clipped = ppo.clip_ratio(ratio)
190    obj = ppo.compute_ppo_objective(ratio, adv)
191    print(f"  new_p={new_p}, old_p={old_p}, adv={adv}")
192    print(f"    ratio={ratio:.2f}, clipped={clipped:.2f}, objective={obj:.2f}")
193
194
195# ============================================
196# 5. SFT 데이터 형식
197# ============================================
198print("\n[5] SFT (Supervised Fine-Tuning) 데이터")
199print("-" * 40)
200
201# Alpaca 형식
202alpaca_data = [
203    {
204        "instruction": "다음 텍스트를 요약하세요.",
205        "input": "인공지능은 컴퓨터 과학의 한 분야로, 인간의 학습능력, 추론능력, "
206                 "지각능력, 자연언어 이해능력 등을 컴퓨터 프로그램으로 실현한 기술이다.",
207        "output": "인공지능은 인간의 지적 능력을 컴퓨터로 구현한 기술입니다."
208    },
209    {
210        "instruction": "다음 문장을 영어로 번역하세요.",
211        "input": "안녕하세요, 오늘 날씨가 좋네요.",
212        "output": "Hello, the weather is nice today."
213    }
214]
215
216print("Alpaca 형식 예시:")
217for item in alpaca_data:
218    print(f"\n  Instruction: {item['instruction']}")
219    print(f"  Input: {item['input'][:40]}...")
220    print(f"  Output: {item['output']}")
221
222# ChatML 형식
223chatml_example = """
224<|system|>
225You are a helpful assistant.
226<|user|>
227What is the capital of Korea?
228<|assistant|>
229The capital of South Korea is Seoul.
230"""
231
232print(f"\nChatML 형식 예시:{chatml_example}")
233
234
235# ============================================
236# 6. Constitutional AI 시뮬레이션
237# ============================================
238print("\n[6] Constitutional AI 시뮬레이션")
239print("-" * 40)
240
241class ConstitutionalAI:
242    """Constitutional AI 시뮬레이션"""
243
244    def __init__(self):
245        self.constitution = [
246            "응답은 도움이 되어야 합니다.",
247            "응답은 해로운 내용을 포함하지 않아야 합니다.",
248            "응답은 정직하고 사실에 기반해야 합니다.",
249            "차별적이거나 편견 있는 내용을 포함하지 않아야 합니다."
250        ]
251
252    def check_principles(self, response):
253        """원칙 위반 확인 (간단한 규칙 기반)"""
254        violations = []
255
256        # 해로운 키워드 체크
257        harmful_words = ["폭력", "위험한", "불법"]
258        if any(word in response for word in harmful_words):
259            violations.append("해로운 내용 포함 가능")
260
261        # 너무 짧은 응답
262        if len(response) < 20:
263            violations.append("충분히 도움이 되지 않음")
264
265        return violations
266
267    def critique(self, prompt, response):
268        """응답 비평"""
269        violations = self.check_principles(response)
270
271        critique = f"프롬프트: {prompt}\n응답: {response}\n\n원칙 검토:\n"
272        for i, principle in enumerate(self.constitution, 1):
273            critique += f"  {i}. {principle}\n"
274
275        if violations:
276            critique += f"\n위반 사항:\n"
277            for v in violations:
278                critique += f"  - {v}\n"
279        else:
280            critique += "\n모든 원칙 준수"
281
282        return critique, violations
283
284    def revise(self, response, violations):
285        """응답 수정 (시뮬레이션)"""
286        revised = response
287        if "충분히 도움이 되지 않음" in violations:
288            revised = response + " 추가적인 설명이 필요하시면 말씀해 주세요."
289        return revised
290
291
292# 테스트
293cai = ConstitutionalAI()
294
295test_responses = [
296    ("파이썬 배우는 방법?", "책을 읽으세요."),
297    ("운동의 효과?", "운동은 건강에 매우 좋습니다. 심혈관 기능 개선, 체중 관리, 정신 건강 향상 등 다양한 이점이 있습니다."),
298]
299
300print("Constitutional AI 검토:")
301for prompt, response in test_responses:
302    critique, violations = cai.critique(prompt, response)
303    print(f"\n{'-'*30}")
304    print(critique)
305    if violations:
306        revised = cai.revise(response, violations)
307        print(f"수정된 응답: {revised}")
308
309
310# ============================================
311# 7. TRL 라이브러리 사용법 (코드만)
312# ============================================
313print("\n[7] TRL 라이브러리 코드 예시")
314print("-" * 40)
315
316trl_code = '''
317# SFT (Supervised Fine-Tuning)
318from trl import SFTTrainer
319from transformers import TrainingArguments
320
321trainer = SFTTrainer(
322    model=model,
323    train_dataset=dataset,
324    formatting_func=format_instruction,
325    max_seq_length=1024,
326    args=TrainingArguments(
327        output_dir="./sft_model",
328        num_train_epochs=3,
329        per_device_train_batch_size=4,
330        learning_rate=2e-5,
331    ),
332)
333trainer.train()
334
335# DPO (Direct Preference Optimization)
336from trl import DPOTrainer, DPOConfig
337
338dpo_config = DPOConfig(
339    beta=0.1,  # 온도 파라미터
340    loss_type="sigmoid",
341    max_length=512,
342)
343
344trainer = DPOTrainer(
345    model=model,
346    ref_model=ref_model,
347    args=dpo_config,
348    train_dataset=preference_dataset,  # prompt, chosen, rejected
349    tokenizer=tokenizer,
350)
351trainer.train()
352
353# PPO (Proximal Policy Optimization)
354from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
355
356ppo_config = PPOConfig(
357    learning_rate=1.41e-5,
358    batch_size=16,
359    ppo_epochs=4,
360    target_kl=0.1,
361)
362
363model = AutoModelForCausalLMWithValueHead.from_pretrained("./sft_model")
364
365ppo_trainer = PPOTrainer(
366    config=ppo_config,
367    model=model,
368    ref_model=ref_model,
369    tokenizer=tokenizer,
370)
371
372# 학습 루프
373for batch in dataloader:
374    query_tensors = tokenize(batch["prompt"])
375    response_tensors = ppo_trainer.generate(query_tensors)
376    rewards = reward_model(query_tensors, response_tensors)
377    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
378'''
379print(trl_code)
380
381
382# ============================================
383# 8. Reward Model 학습 (코드만)
384# ============================================
385print("\n[8] Reward Model 학습 코드")
386print("-" * 40)
387
388reward_code = '''
389from transformers import AutoModelForSequenceClassification, TrainingArguments
390from trl import RewardTrainer
391
392# Reward Model (분류 헤드 추가)
393reward_model = AutoModelForSequenceClassification.from_pretrained(
394    "meta-llama/Llama-2-7b-hf",
395    num_labels=1  # 스칼라 출력
396)
397
398# 학습
399training_args = TrainingArguments(
400    output_dir="./reward_model",
401    num_train_epochs=1,
402    per_device_train_batch_size=4,
403    learning_rate=1e-5,
404)
405
406trainer = RewardTrainer(
407    model=reward_model,
408    args=training_args,
409    train_dataset=preference_dataset,
410    tokenizer=tokenizer,
411)
412trainer.train()
413
414# 보상 점수 계산
415def get_reward(prompt, response):
416    text = f"### Prompt: {prompt}\\n### Response: {response}"
417    inputs = tokenizer(text, return_tensors="pt")
418    with torch.no_grad():
419        reward = reward_model(**inputs).logits.squeeze().item()
420    return reward
421'''
422print(reward_code)
423
424
425# ============================================
426# 정리
427# ============================================
428print("\n" + "=" * 60)
429print("RLHF 정리")
430print("=" * 60)
431
432summary = """
433RLHF 파이프라인:
434
4351. SFT (Supervised Fine-Tuning)
436   - 고품질 데이터로 기본 능력 학습
437   - 형식: instruction, input, output
438
4392. Reward Model 학습
440   - 선호도 데이터로 보상 함수 학습
441   - 형식: prompt, chosen, rejected
442
4433. PPO (강화학습)
444   - Reward Model로 정책 최적화
445   - KL 페널티로 기준 모델과의 거리 제한
446
4474. DPO (Direct Preference Optimization)
448   - Reward Model 없이 직접 선호도 학습
449   - L = -log(sigmoid(β * (log π(y_w) - log π(y_l))))
450
4515. Constitutional AI
452   - 원칙 기반 자기 비평 및 수정
453   - 안전성 향상
454
455정렬 방법 선택:
456- 간단한 정렬: DPO (추천)
457- 복잡한 정렬: RLHF (PPO)
458- 안전성 중요: Constitutional AI
459"""
460print(summary)