1"""
2μ€μ RL νλ‘μ νΈ: μμ ν ꡬ쑰μ νΈλ μ΄λ© νκ²½ ꡬν
3νλ‘μ νΈ κ΅¬μ‘°, νμ΅ νμ΄νλΌμΈ, λͺ¨λΈ μ μ₯/λ‘λ, νκ° λ° μκ°ν ν¬ν¨
4"""
5import torch
6import torch.nn as nn
7import torch.nn.functional as F
8import numpy as np
9import matplotlib.pyplot as plt
10from collections import deque
11import json
12import os
13
14
# =============================================================================
# 1. Custom environment: a simple trading environment
# =============================================================================
18
class SimpleTradingEnv:
    """
    Minimal single-stock trading environment.

    - Observation: [price, 5-step MA, 20-step MA, shares held, cash ratio]
      (price-like entries are normalized by the initial price of 100).
    - Actions: 0 = sell all, 1 = hold, 2 = buy as many shares as cash allows.
    - Reward: fractional change in portfolio value per step, plus a terminal
      bonus proportional to the episode's total return.
    """

    def __init__(self, initial_balance=10000, stock_dim=1, max_steps=100):
        self.initial_balance = initial_balance
        self.stock_dim = stock_dim
        self.max_steps = max_steps

        # Observation space: [price, MA5, MA20, shares held, cash ratio]
        self.obs_dim = 5
        # Action space: sell(0), hold(1), buy(2)
        self.action_dim = 3

        self.reset()

    def _generate_price_series(self):
        """Generate a synthetic price series (random walk + linear trend)."""
        trend = np.random.choice([-1, 0, 1])  # down / flat / up regime
        prices = [100.0]

        for _ in range(self.max_steps):
            # Random walk step plus a small deterministic drift
            change = np.random.randn() * 2 + trend * 0.5
            new_price = max(50.0, prices[-1] + change)  # hard price floor
            prices.append(new_price)

        return np.array(prices)

    def reset(self):
        """Reset the environment and return the initial observation."""
        self.prices = self._generate_price_series()
        self.current_step = 0

        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_shares_bought = 0
        self.total_shares_sold = 0

        return self._get_observation()

    def _get_observation(self):
        """Return the current observation vector (float32, length 5)."""
        current_price = self.prices[self.current_step]

        # Moving averages over up-to-5 and up-to-20 past steps
        start_5 = max(0, self.current_step - 5)
        start_20 = max(0, self.current_step - 20)
        ma5 = np.mean(self.prices[start_5:self.current_step + 1])
        ma20 = np.mean(self.prices[start_20:self.current_step + 1])

        # Portfolio information
        total_value = self.balance + self.shares_held * current_price
        cash_ratio = self.balance / total_value if total_value > 0 else 0

        obs = np.array([
            current_price / 100.0,  # normalize by the initial price level
            ma5 / 100.0,
            ma20 / 100.0,
            self.shares_held / 100.0,
            cash_ratio
        ], dtype=np.float32)

        return obs

    def step(self, action):
        """Apply one action, advance one step, and return (obs, reward, done, info)."""
        current_price = self.prices[self.current_step]
        prev_value = self.balance + self.shares_held * current_price

        # Execute the action
        if action == 0:  # sell everything
            if self.shares_held > 0:
                self.balance += self.shares_held * current_price * 0.99  # 1% fee
                self.total_shares_sold += self.shares_held
                self.shares_held = 0

        elif action == 2:  # buy as much as cash allows
            # BUGFIX: size the order against the fee-inclusive unit cost.
            # Dividing by the bare price made `cost <= self.balance` fail in
            # nearly every case (cost always +1% over balance), so the agent
            # could effectively never buy.
            shares_to_buy = int(self.balance // (current_price * 1.01))
            if shares_to_buy > 0:
                cost = shares_to_buy * current_price * 1.01  # 1% fee
                if cost <= self.balance:
                    self.shares_held += shares_to_buy
                    self.balance -= cost
                    self.total_shares_bought += shares_to_buy

        # Advance to the next step
        self.current_step += 1

        # Reward: fractional change in portfolio value
        next_price = self.prices[self.current_step]
        current_value = self.balance + self.shares_held * next_price
        reward = (current_value - prev_value) / prev_value

        # Episode termination
        done = self.current_step >= self.max_steps - 1

        # Terminal bonus/penalty proportional to the episode's total return
        if done:
            total_return = (current_value - self.initial_balance) / self.initial_balance
            reward += total_return * 10

        return self._get_observation(), reward, done, {}
128
129
# =============================================================================
# 2. Network architecture
# =============================================================================
133
class TradingPolicyNetwork(nn.Module):
    """Actor-critic network for the trading task.

    A two-layer MLP trunk is shared between a categorical policy head
    (`actor`) and a scalar state-value head (`critic`).
    """

    def __init__(self, obs_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Shared feature trunk: obs -> hidden -> hidden
        trunk = [
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ]
        self.feature_extractor = nn.Sequential(*trunk)

        # Heads: policy logits and state value
        self.actor = nn.Linear(hidden_dim, action_dim)
        self.critic = nn.Linear(hidden_dim, 1)

        self._init_weights()

    def _init_weights(self):
        """Orthogonal init (gain sqrt(2)) and zero bias for every Linear layer."""
        gain = np.sqrt(2)
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.orthogonal_(layer.weight, gain=gain)
            nn.init.constant_(layer.bias, 0)

    def forward(self, obs):
        """Return (action probabilities, state value) for a batch of observations."""
        hidden = self.feature_extractor(obs)
        action_probs = F.softmax(self.actor(hidden), dim=-1)
        state_value = self.critic(hidden)
        return action_probs, state_value

    def get_action_and_value(self, obs, action=None):
        """Sample an action (or score a given one).

        Returns (action, log_prob, entropy, value) with value squeezed
        to shape (batch,).
        """
        action_probs, state_value = self.forward(obs)
        dist = torch.distributions.Categorical(action_probs)
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), state_value.squeeze(-1)
171
172
# =============================================================================
# 3. PPO agent (for the project)
# =============================================================================
176
class PPOAgent:
    """PPO agent: collects rollouts, computes GAE, runs clipped-surrogate updates."""

    def __init__(self, config):
        """Build the actor-critic network and optimizer from a config dict.

        Required keys: 'obs_dim', 'action_dim'.
        Optional (defaults): 'hidden_dim' 128, 'lr' 3e-4, 'gamma' 0.99,
        'gae_lambda' 0.95, 'clip_epsilon' 0.2, 'value_coef' 0.5,
        'entropy_coef' 0.01, 'max_grad_norm' 0.5, 'n_epochs' 10,
        'batch_size' 64.
        """
        self.config = config

        # Actor-critic network
        self.network = TradingPolicyNetwork(
            obs_dim=config['obs_dim'],
            action_dim=config['action_dim'],
            hidden_dim=config.get('hidden_dim', 128)
        )

        self.optimizer = torch.optim.Adam(
            self.network.parameters(),
            lr=config.get('lr', 3e-4)
        )

        # Hyperparameters
        self.gamma = config.get('gamma', 0.99)
        self.gae_lambda = config.get('gae_lambda', 0.95)
        self.clip_epsilon = config.get('clip_epsilon', 0.2)
        self.value_coef = config.get('value_coef', 0.5)
        self.entropy_coef = config.get('entropy_coef', 0.01)
        self.max_grad_norm = config.get('max_grad_norm', 0.5)
        self.n_epochs = config.get('n_epochs', 10)
        self.batch_size = config.get('batch_size', 64)

    def collect_rollout(self, env, n_steps):
        """Run the current policy in `env` for n_steps and return the trajectory.

        Episodes ending mid-rollout are restarted via env.reset(), so one
        rollout may span several episodes. Returns a dict of numpy arrays
        ('obs', 'actions', 'rewards', 'dones', 'values', 'log_probs') plus
        'last_value' (float) for bootstrapping GAE at the rollout cut-off.
        """
        rollout = {
            'obs': [], 'actions': [], 'rewards': [], 'dones': [],
            'values': [], 'log_probs': []
        }

        obs = env.reset()

        for _ in range(n_steps):
            obs_tensor = torch.FloatTensor(obs).unsqueeze(0)

            with torch.no_grad():
                action, log_prob, _, value = self.network.get_action_and_value(obs_tensor)

            next_obs, reward, done, _ = env.step(action.item())

            rollout['obs'].append(obs)
            rollout['actions'].append(action.item())
            rollout['rewards'].append(reward)
            rollout['dones'].append(done)
            rollout['values'].append(value.item())
            rollout['log_probs'].append(log_prob.item())

            # Start a fresh episode immediately when the current one ends
            obs = next_obs if not done else env.reset()

        # Bootstrap value for the state following the last collected step
        with torch.no_grad():
            _, _, _, last_value = self.network.get_action_and_value(
                torch.FloatTensor(obs).unsqueeze(0)
            )
        rollout['last_value'] = last_value.item()

        # Convert the trajectory lists to numpy arrays
        for key in ['obs', 'actions', 'rewards', 'dones', 'values', 'log_probs']:
            rollout[key] = np.array(rollout[key])

        return rollout

    def compute_gae(self, rollout):
        """Compute Generalized Advantage Estimation.

        Returns (advantages, returns) as numpy arrays; returns = advantages
        + values are the value-function regression targets. The (1 - done)
        factor stops bootstrapping across episode boundaries.
        """
        rewards = rollout['rewards']
        values = rollout['values']
        dones = rollout['dones']
        last_value = rollout['last_value']

        advantages = np.zeros_like(rewards)
        last_gae = 0

        for t in reversed(range(len(rewards))):
            next_val = last_value if t == len(rewards) - 1 else values[t + 1]
            delta = rewards[t] + self.gamma * next_val * (1 - dones[t]) - values[t]
            advantages[t] = last_gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * last_gae

        returns = advantages + values
        return advantages, returns

    def update(self, rollout):
        """Run n_epochs of shuffled-minibatch PPO updates on one rollout.

        Returns the mean total loss over all minibatch updates.
        """
        advantages, returns = self.compute_gae(rollout)
        # Normalize advantages for gradient stability
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Tensor conversion
        obs = torch.FloatTensor(rollout['obs'])
        actions = torch.LongTensor(rollout['actions'])
        old_log_probs = torch.FloatTensor(rollout['log_probs'])
        advantages_tensor = torch.FloatTensor(advantages)
        returns_tensor = torch.FloatTensor(returns)

        # Several epochs over shuffled minibatches
        total_loss = 0
        n_updates = 0

        for _ in range(self.n_epochs):
            indices = np.random.permutation(len(obs))

            for start in range(0, len(obs), self.batch_size):
                idx = indices[start:start + self.batch_size]

                _, new_log_probs, entropy, values = self.network.get_action_and_value(
                    obs[idx], actions[idx]
                )

                # Clipped surrogate objective (PPO loss)
                ratio = torch.exp(new_log_probs - old_log_probs[idx])
                surr1 = ratio * advantages_tensor[idx]
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages_tensor[idx]

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = F.mse_loss(values, returns_tensor[idx])
                entropy_loss = -entropy.mean()  # maximize entropy => minimize its negative

                loss = actor_loss + self.value_coef * critic_loss + self.entropy_coef * entropy_loss

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), self.max_grad_norm)
                self.optimizer.step()

                total_loss += loss.item()
                n_updates += 1

        return total_loss / n_updates if n_updates > 0 else 0

    def save(self, filepath):
        """Save network/optimizer state and the config dict to `filepath`."""
        torch.save({
            'network_state_dict': self.network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'config': self.config
        }, filepath)

    def load(self, filepath):
        """Restore network/optimizer state saved by `save`.

        BUGFIX: pass map_location='cpu' so checkpoints written on a CUDA
        machine can be restored on CPU-only hosts (this code never moves
        the network off the CPU).
        """
        checkpoint = torch.load(filepath, map_location='cpu')
        self.network.load_state_dict(checkpoint['network_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
322
323
# =============================================================================
# 4. Training pipeline
# =============================================================================
327
class TrainingLogger:
    """Collects per-episode training metrics and persists/visualizes them."""

    def __init__(self, log_dir='logs'):
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # One list per metric, appended to in lockstep by `log`.
        self.metrics = {
            'episodes': [],
            'rewards': [],
            'losses': [],
            'returns': [],
        }

    def log(self, episode, reward, loss, portfolio_return):
        """Append one episode's worth of metrics."""
        entries = (
            ('episodes', episode),
            ('rewards', reward),
            ('losses', loss),
            ('returns', portfolio_return),
        )
        for key, value in entries:
            self.metrics[key].append(value)

    def save(self):
        """Write all metrics to <log_dir>/training_log.json."""
        filepath = os.path.join(self.log_dir, 'training_log.json')
        with open(filepath, 'w') as f:
            json.dump(self.metrics, f, indent=2)

    def plot(self):
        """Render a 2x2 dashboard of training curves and save it as a PNG."""
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        episodes = self.metrics['episodes']
        rewards = self.metrics['rewards']
        losses = self.metrics['losses']
        returns = self.metrics['returns']

        def smoothed_curve(series):
            # Moving-average overlay; only meaningful with enough points.
            window = min(50, len(series) // 10)
            values = np.convolve(series, np.ones(window) / window, mode='valid')
            return range(window - 1, len(series)), values

        # Episode rewards (raw + smoothed)
        ax = axes[0, 0]
        ax.plot(episodes, rewards, alpha=0.3)
        if len(rewards) > 10:
            xs, ys = smoothed_curve(rewards)
            ax.plot(xs, ys, linewidth=2)
        ax.set_title('μνΌμλ 보μ')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Reward')
        ax.grid(True, alpha=0.3)

        # Training loss
        ax = axes[0, 1]
        ax.plot(episodes, losses)
        ax.set_title('νμ΅ μμ€')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Loss')
        ax.grid(True, alpha=0.3)

        # Portfolio returns (raw + smoothed, with a zero reference line)
        ax = axes[1, 0]
        ax.plot(episodes, returns, alpha=0.3)
        if len(returns) > 10:
            xs, ys = smoothed_curve(returns)
            ax.plot(xs, ys, linewidth=2)
        ax.axhline(y=0, color='r', linestyle='--', alpha=0.3)
        ax.set_title('ν¬νΈν΄λ¦¬μ€ μμ΅λ₯ ')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Return (%)')
        ax.grid(True, alpha=0.3)

        # Return distribution
        ax = axes[1, 1]
        ax.hist(returns, bins=30, alpha=0.7, edgecolor='black')
        ax.axvline(x=0, color='r', linestyle='--', linewidth=2)
        ax.set_title('μμ΅λ₯ λΆν¬')
        ax.set_xlabel('Return (%)')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(self.log_dir, 'training_progress.png'), dpi=100, bbox_inches='tight')
        print(f"νμ΅ κ·Έλν μ μ₯: {self.log_dir}/training_progress.png")
400
401
def train_agent(config):
    """Train a PPO agent on SimpleTradingEnv.

    Builds the environment, agent, and logger from `config`, runs the
    collect/update loop for config['n_episodes'] iterations, checkpoints
    every config['save_interval'] episodes, and writes the metric log and
    progress plot at the end.

    Returns (agent, logger).
    """
    # Environment
    env = SimpleTradingEnv(
        initial_balance=config['initial_balance'],
        max_steps=config['max_steps']
    )

    # Agent
    agent_config = {
        'obs_dim': env.obs_dim,
        'action_dim': env.action_dim,
        'hidden_dim': config['hidden_dim'],
        'lr': config['lr'],
        'gamma': config['gamma'],
        'gae_lambda': config['gae_lambda'],
        'clip_epsilon': config['clip_epsilon'],
        'n_epochs': config['n_epochs'],
        'batch_size': config['batch_size']
    }
    agent = PPOAgent(agent_config)

    # Logger
    logger = TrainingLogger(log_dir=config['log_dir'])

    # Training loop
    n_episodes = config['n_episodes']
    n_steps = config['n_steps']

    print("νμ΅ μμ...\n")

    for episode in range(n_episodes):
        # Collect one rollout
        rollout = agent.collect_rollout(env, n_steps)

        # Rollout statistics
        episode_reward = rollout['rewards'].sum()
        # BUGFIX: the previous formula mixed normalized observation entries
        # with raw currency amounts (shares/100 * 100 used as a value, and
        # cash_ratio scaled by the *initial* balance), yielding a meaningless
        # number. Read the portfolio value directly from the environment.
        # NOTE(review): if an episode ended mid-rollout the env was reset, so
        # this reflects the env's state at the end of the rollout.
        final_value = env.balance + env.shares_held * env.prices[env.current_step]
        portfolio_return = (final_value - config['initial_balance']) / config['initial_balance'] * 100

        # PPO update
        loss = agent.update(rollout)

        # Logging
        logger.log(episode, episode_reward, loss, portfolio_return)

        if (episode + 1) % config['log_interval'] == 0:
            avg_reward = np.mean(logger.metrics['rewards'][-config['log_interval']:])
            avg_return = np.mean(logger.metrics['returns'][-config['log_interval']:])
            print(f"Episode {episode + 1}/{n_episodes} | "
                  f"Avg Reward: {avg_reward:.2f} | "
                  f"Avg Return: {avg_return:.2f}% | "
                  f"Loss: {loss:.4f}")

        # Periodic checkpoint
        if (episode + 1) % config['save_interval'] == 0:
            save_path = os.path.join(config['checkpoint_dir'], f'agent_ep{episode + 1}.pt')
            agent.save(save_path)
            print(f" 체ν¬ν¬μΈνΈ μ μ₯: {save_path}")

    # Final model
    final_path = os.path.join(config['checkpoint_dir'], 'agent_final.pt')
    agent.save(final_path)

    # Save and visualize the logs
    logger.save()
    logger.plot()

    print("\nνμ΅ μλ£!")
    return agent, logger
472
473
# =============================================================================
# 5. Evaluation
# =============================================================================
477
def evaluate_agent(agent, n_episodes=10, render=False):
    """Evaluate a trained agent over several fresh episodes.

    Actions are sampled from the policy exactly as during training; `render`
    is accepted for interface compatibility but currently unused. Prints
    per-episode and aggregate statistics and returns the list of per-episode
    portfolio returns (in percent).
    """
    env = SimpleTradingEnv()

    def run_one_episode():
        # Roll a single episode to completion; return its portfolio return (%).
        observation = env.reset()
        finished = False
        while not finished:
            with torch.no_grad():
                state = torch.FloatTensor(observation).unsqueeze(0)
                chosen_action, _, _, _ = agent.network.get_action_and_value(state)
            observation, _, finished, _ = env.step(chosen_action.item())

        end_value = env.balance + env.shares_held * env.prices[env.current_step]
        return (end_value - env.initial_balance) / env.initial_balance * 100

    print("\n=== μμ΄μ νΈ νκ° ===\n")

    episode_returns = []
    for episode in range(n_episodes):
        pct_return = run_one_episode()
        episode_returns.append(pct_return)
        print(f"Episode {episode + 1}: Return = {pct_return:.2f}%")

    mean_return = np.mean(episode_returns)
    std_return = np.std(episode_returns)
    print(f"\nνκ· μμ΅λ₯ : {mean_return:.2f}% Β± {std_return:.2f}%")

    return episode_returns
512
513
# =============================================================================
# 6. Main
# =============================================================================
517
if __name__ == "__main__":
    # Project configuration.
    # NOTE: the user-facing strings in this block were mojibake-corrupted
    # (broken multibyte encoding); they have been restored as English.
    config = {
        # Environment
        'initial_balance': 10000,
        'max_steps': 100,

        # Network
        'hidden_dim': 128,

        # Optimization
        'lr': 3e-4,
        'gamma': 0.99,
        'gae_lambda': 0.95,
        'clip_epsilon': 0.2,
        'n_epochs': 10,
        'batch_size': 64,

        # Training schedule
        'n_episodes': 500,
        'n_steps': 100,
        'log_interval': 50,
        'save_interval': 100,

        # Directories
        'log_dir': 'logs',
        'checkpoint_dir': 'checkpoints'
    }

    # Create output directories
    os.makedirs(config['log_dir'], exist_ok=True)
    os.makedirs(config['checkpoint_dir'], exist_ok=True)

    print("=" * 60)
    print("Practical RL project: trading agent")
    print("=" * 60)

    # Print the configuration
    print("\nConfig:")
    for key, value in config.items():
        print(f"  {key}: {value}")

    # Train
    agent, logger = train_agent(config)

    # Evaluate
    returns = evaluate_agent(agent, n_episodes=20)

    print("\nProject complete!")
    print("\nGenerated files:")
    print(f"  - {config['log_dir']}/training_log.json")
    print(f"  - {config['log_dir']}/training_progress.png")
    print(f"  - {config['checkpoint_dir']}/agent_final.pt")

    print("\nKey takeaways:")
    print("  1. Custom environment implementation (Gymnasium style)")
    print("  2. Modular PPO agent")
    print("  3. Training pipeline (collect / update / log)")
    print("  4. Model save/load")
    print("  5. Evaluation and visualization")
    print("  6. Project structure best practices")