1"""
2μ€μ RL νλ‘μ νΈ: μμ ν ꡬ쑰μ νΈλ μ΄λ© νκ²½ ꡬν
3νλ‘μ νΈ κ΅¬μ‘°, νμ΅ νμ΄νλΌμΈ, λͺ¨λΈ μ μ₯/λ‘λ, νκ° λ° μκ°ν ν¬ν¨
4"""
5import torch
6import torch.nn as nn
7import torch.nn.functional as F
8import numpy as np
9import matplotlib.pyplot as plt
10from collections import deque
11import json
12import os
13
14
# =============================================================================
# 1. Custom environment: a simple trading environment
# =============================================================================
18
class SimpleTradingEnv:
    """
    Minimal single-stock trading environment.

    - Observation: [price, 5-step MA, 20-step MA, shares held, cash ratio]
      (price-like entries are normalized by the initial price of 100).
    - Actions: 0 = sell all, 1 = hold, 2 = buy as many shares as cash allows.
    - Reward: fractional change in portfolio value per step, plus a terminal
      bonus proportional to the episode's total return.
    """

    def __init__(self, initial_balance=10000, stock_dim=1, max_steps=100):
        self.initial_balance = initial_balance
        self.stock_dim = stock_dim
        self.max_steps = max_steps

        # Observation space: [price, MA5, MA20, shares held, cash ratio]
        self.obs_dim = 5
        # Action space: sell(0), hold(1), buy(2)
        self.action_dim = 3

        self.reset()

    def _generate_price_series(self):
        """Generate a synthetic price series (random walk + linear trend)."""
        trend = np.random.choice([-1, 0, 1])  # down / flat / up regime
        prices = [100.0]

        for _ in range(self.max_steps):
            # Random walk step plus a small deterministic drift
            change = np.random.randn() * 2 + trend * 0.5
            new_price = max(50.0, prices[-1] + change)  # hard price floor
            prices.append(new_price)

        return np.array(prices)

    def reset(self):
        """Reset the environment and return the initial observation."""
        self.prices = self._generate_price_series()
        self.current_step = 0

        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_shares_bought = 0
        self.total_shares_sold = 0

        return self._get_observation()

    def _get_observation(self):
        """Return the current observation vector (float32, length 5)."""
        current_price = self.prices[self.current_step]

        # Moving averages over up-to-5 and up-to-20 past steps
        start_5 = max(0, self.current_step - 5)
        start_20 = max(0, self.current_step - 20)
        ma5 = np.mean(self.prices[start_5:self.current_step + 1])
        ma20 = np.mean(self.prices[start_20:self.current_step + 1])

        # Portfolio information
        total_value = self.balance + self.shares_held * current_price
        cash_ratio = self.balance / total_value if total_value > 0 else 0

        obs = np.array([
            current_price / 100.0,  # normalize by the initial price level
            ma5 / 100.0,
            ma20 / 100.0,
            self.shares_held / 100.0,
            cash_ratio
        ], dtype=np.float32)

        return obs

    def step(self, action):
        """Apply one action, advance one step, and return (obs, reward, done, info)."""
        current_price = self.prices[self.current_step]
        prev_value = self.balance + self.shares_held * current_price

        # Execute the action
        if action == 0:  # sell everything
            if self.shares_held > 0:
                self.balance += self.shares_held * current_price * 0.99  # 1% fee
                self.total_shares_sold += self.shares_held
                self.shares_held = 0

        elif action == 2:  # buy as much as cash allows
            # BUGFIX: size the order against the fee-inclusive unit cost.
            # Dividing by the bare price made `cost <= self.balance` fail in
            # nearly every case (cost always +1% over balance), so the agent
            # could effectively never buy.
            shares_to_buy = int(self.balance // (current_price * 1.01))
            if shares_to_buy > 0:
                cost = shares_to_buy * current_price * 1.01  # 1% fee
                if cost <= self.balance:
                    self.shares_held += shares_to_buy
                    self.balance -= cost
                    self.total_shares_bought += shares_to_buy

        # Advance to the next step
        self.current_step += 1

        # Reward: fractional change in portfolio value
        next_price = self.prices[self.current_step]
        current_value = self.balance + self.shares_held * next_price
        reward = (current_value - prev_value) / prev_value

        # Episode termination
        done = self.current_step >= self.max_steps - 1

        # Terminal bonus/penalty proportional to the episode's total return
        if done:
            total_return = (current_value - self.initial_balance) / self.initial_balance
            reward += total_return * 10

        return self._get_observation(), reward, done, {}
128
129
# =============================================================================
# 2. Network architecture
# =============================================================================
133
class TradingPolicyNetwork(nn.Module):
    """Actor-critic network for the trading task.

    A two-layer MLP trunk is shared between a categorical policy head
    (`actor`) and a scalar state-value head (`critic`).
    """

    def __init__(self, obs_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Shared feature trunk: obs -> hidden -> hidden
        trunk = [
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ]
        self.feature_extractor = nn.Sequential(*trunk)

        # Heads: policy logits and state value
        self.actor = nn.Linear(hidden_dim, action_dim)
        self.critic = nn.Linear(hidden_dim, 1)

        self._init_weights()

    def _init_weights(self):
        """Orthogonal init (gain sqrt(2)) and zero bias for every Linear layer."""
        gain = np.sqrt(2)
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.orthogonal_(layer.weight, gain=gain)
            nn.init.constant_(layer.bias, 0)

    def forward(self, obs):
        """Return (action probabilities, state value) for a batch of observations."""
        hidden = self.feature_extractor(obs)
        action_probs = F.softmax(self.actor(hidden), dim=-1)
        state_value = self.critic(hidden)
        return action_probs, state_value

    def get_action_and_value(self, obs, action=None):
        """Sample an action (or score a given one).

        Returns (action, log_prob, entropy, value) with value squeezed
        to shape (batch,).
        """
        action_probs, state_value = self.forward(obs)
        dist = torch.distributions.Categorical(action_probs)
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), state_value.squeeze(-1)
171
172
# =============================================================================
# 3. PPO agent (for the project)
# =============================================================================
176
class PPOAgent:
    """PPO agent: collects rollouts, computes GAE, runs clipped-surrogate updates."""

    def __init__(self, config):
        """Build the actor-critic network and optimizer from a config dict.

        Required keys: 'obs_dim', 'action_dim'.
        Optional (defaults): 'hidden_dim' 128, 'lr' 3e-4, 'gamma' 0.99,
        'gae_lambda' 0.95, 'clip_epsilon' 0.2, 'value_coef' 0.5,
        'entropy_coef' 0.01, 'max_grad_norm' 0.5, 'n_epochs' 10,
        'batch_size' 64.
        """
        self.config = config

        # Actor-critic network
        self.network = TradingPolicyNetwork(
            obs_dim=config['obs_dim'],
            action_dim=config['action_dim'],
            hidden_dim=config.get('hidden_dim', 128)
        )

        self.optimizer = torch.optim.Adam(
            self.network.parameters(),
            lr=config.get('lr', 3e-4)
        )

        # Hyperparameters
        self.gamma = config.get('gamma', 0.99)
        self.gae_lambda = config.get('gae_lambda', 0.95)
        self.clip_epsilon = config.get('clip_epsilon', 0.2)
        self.value_coef = config.get('value_coef', 0.5)
        self.entropy_coef = config.get('entropy_coef', 0.01)
        self.max_grad_norm = config.get('max_grad_norm', 0.5)
        self.n_epochs = config.get('n_epochs', 10)
        self.batch_size = config.get('batch_size', 64)

    def collect_rollout(self, env, n_steps):
        """Run the current policy in `env` for n_steps and return the trajectory.

        Episodes ending mid-rollout are restarted via env.reset(), so one
        rollout may span several episodes. Returns a dict of numpy arrays
        ('obs', 'actions', 'rewards', 'dones', 'values', 'log_probs') plus
        'last_value' (float) for bootstrapping GAE at the rollout cut-off.
        """
        rollout = {
            'obs': [], 'actions': [], 'rewards': [], 'dones': [],
            'values': [], 'log_probs': []
        }

        obs = env.reset()

        for _ in range(n_steps):
            obs_tensor = torch.FloatTensor(obs).unsqueeze(0)

            with torch.no_grad():
                action, log_prob, _, value = self.network.get_action_and_value(obs_tensor)

            next_obs, reward, done, _ = env.step(action.item())

            rollout['obs'].append(obs)
            rollout['actions'].append(action.item())
            rollout['rewards'].append(reward)
            rollout['dones'].append(done)
            rollout['values'].append(value.item())
            rollout['log_probs'].append(log_prob.item())

            # Start a fresh episode immediately when the current one ends
            obs = next_obs if not done else env.reset()

        # Bootstrap value for the state following the last collected step
        with torch.no_grad():
            _, _, _, last_value = self.network.get_action_and_value(
                torch.FloatTensor(obs).unsqueeze(0)
            )
        rollout['last_value'] = last_value.item()

        # Convert the trajectory lists to numpy arrays
        for key in ['obs', 'actions', 'rewards', 'dones', 'values', 'log_probs']:
            rollout[key] = np.array(rollout[key])

        return rollout

    def compute_gae(self, rollout):
        """Compute Generalized Advantage Estimation.

        Returns (advantages, returns) as numpy arrays; returns = advantages
        + values are the value-function regression targets. The (1 - done)
        factor stops bootstrapping across episode boundaries.
        """
        rewards = rollout['rewards']
        values = rollout['values']
        dones = rollout['dones']
        last_value = rollout['last_value']

        advantages = np.zeros_like(rewards)
        last_gae = 0

        for t in reversed(range(len(rewards))):
            next_val = last_value if t == len(rewards) - 1 else values[t + 1]
            delta = rewards[t] + self.gamma * next_val * (1 - dones[t]) - values[t]
            advantages[t] = last_gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * last_gae

        returns = advantages + values
        return advantages, returns

    def update(self, rollout):
        """Run n_epochs of shuffled-minibatch PPO updates on one rollout.

        Returns the mean total loss over all minibatch updates.
        """
        advantages, returns = self.compute_gae(rollout)
        # Normalize advantages for gradient stability
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Tensor conversion
        obs = torch.FloatTensor(rollout['obs'])
        actions = torch.LongTensor(rollout['actions'])
        old_log_probs = torch.FloatTensor(rollout['log_probs'])
        advantages_tensor = torch.FloatTensor(advantages)
        returns_tensor = torch.FloatTensor(returns)

        # Several epochs over shuffled minibatches
        total_loss = 0
        n_updates = 0

        for _ in range(self.n_epochs):
            indices = np.random.permutation(len(obs))

            for start in range(0, len(obs), self.batch_size):
                idx = indices[start:start + self.batch_size]

                _, new_log_probs, entropy, values = self.network.get_action_and_value(
                    obs[idx], actions[idx]
                )

                # Clipped surrogate objective (PPO loss)
                ratio = torch.exp(new_log_probs - old_log_probs[idx])
                surr1 = ratio * advantages_tensor[idx]
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages_tensor[idx]

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = F.mse_loss(values, returns_tensor[idx])
                entropy_loss = -entropy.mean()  # maximize entropy => minimize its negative

                loss = actor_loss + self.value_coef * critic_loss + self.entropy_coef * entropy_loss

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), self.max_grad_norm)
                self.optimizer.step()

                total_loss += loss.item()
                n_updates += 1

        return total_loss / n_updates if n_updates > 0 else 0

    def save(self, filepath):
        """Save network/optimizer state and the config dict to `filepath`."""
        torch.save({
            'network_state_dict': self.network.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'config': self.config
        }, filepath)

    def load(self, filepath):
        """Restore network/optimizer state saved by `save`.

        BUGFIX: pass map_location='cpu' so checkpoints written on a CUDA
        machine can be restored on CPU-only hosts (this code never moves
        the network off the CPU).
        """
        checkpoint = torch.load(filepath, map_location='cpu')
        self.network.load_state_dict(checkpoint['network_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
322
323
# =============================================================================
# 4. Training pipeline
# =============================================================================
327
class TrainingLogger:
    """Collects per-episode training metrics and persists/visualizes them."""

    def __init__(self, log_dir='logs'):
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # One list per metric, appended to in lockstep by `log`.
        self.metrics = {
            'episodes': [],
            'rewards': [],
            'losses': [],
            'returns': [],
        }

    def log(self, episode, reward, loss, portfolio_return):
        """Append one episode's worth of metrics."""
        entries = (
            ('episodes', episode),
            ('rewards', reward),
            ('losses', loss),
            ('returns', portfolio_return),
        )
        for key, value in entries:
            self.metrics[key].append(value)

    def save(self):
        """Write all metrics to <log_dir>/training_log.json."""
        filepath = os.path.join(self.log_dir, 'training_log.json')
        with open(filepath, 'w') as f:
            json.dump(self.metrics, f, indent=2)

    def plot(self):
        """Render a 2x2 dashboard of training curves and save it as a PNG."""
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        episodes = self.metrics['episodes']
        rewards = self.metrics['rewards']
        losses = self.metrics['losses']
        returns = self.metrics['returns']

        def smoothed_curve(series):
            # Moving-average overlay; only meaningful with enough points.
            window = min(50, len(series) // 10)
            values = np.convolve(series, np.ones(window) / window, mode='valid')
            return range(window - 1, len(series)), values

        # Episode rewards (raw + smoothed)
        ax = axes[0, 0]
        ax.plot(episodes, rewards, alpha=0.3)
        if len(rewards) > 10:
            xs, ys = smoothed_curve(rewards)
            ax.plot(xs, ys, linewidth=2)
        ax.set_title('μνΌμλ 보μ')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Reward')
        ax.grid(True, alpha=0.3)

        # Training loss
        ax = axes[0, 1]
        ax.plot(episodes, losses)
        ax.set_title('νμ΅ μμ€')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Loss')
        ax.grid(True, alpha=0.3)

        # Portfolio returns (raw + smoothed, with a zero reference line)
        ax = axes[1, 0]
        ax.plot(episodes, returns, alpha=0.3)
        if len(returns) > 10:
            xs, ys = smoothed_curve(returns)
            ax.plot(xs, ys, linewidth=2)
        ax.axhline(y=0, color='r', linestyle='--', alpha=0.3)
        ax.set_title('ν¬νΈν΄λ¦¬μ€ μμ΅λ₯ ')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Return (%)')
        ax.grid(True, alpha=0.3)

        # Return distribution
        ax = axes[1, 1]
        ax.hist(returns, bins=30, alpha=0.7, edgecolor='black')
        ax.axvline(x=0, color='r', linestyle='--', linewidth=2)
        ax.set_title('μμ΅λ₯ λΆν¬')
        ax.set_xlabel('Return (%)')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(self.log_dir, 'training_progress.png'), dpi=100, bbox_inches='tight')
        print(f"νμ΅ κ·Έλν μ μ₯: {self.log_dir}/training_progress.png")
400
401
def train_agent(config):
    """Train a PPO agent on SimpleTradingEnv.

    Builds the environment, agent, and logger from `config`, runs the
    collect/update loop for config['n_episodes'] iterations, checkpoints
    every config['save_interval'] episodes, and writes the metric log and
    progress plot at the end.

    Returns (agent, logger).
    """
    # Environment
    env = SimpleTradingEnv(
        initial_balance=config['initial_balance'],
        max_steps=config['max_steps']
    )

    # Agent
    agent_config = {
        'obs_dim': env.obs_dim,
        'action_dim': env.action_dim,
        'hidden_dim': config['hidden_dim'],
        'lr': config['lr'],
        'gamma': config['gamma'],
        'gae_lambda': config['gae_lambda'],
        'clip_epsilon': config['clip_epsilon'],
        'n_epochs': config['n_epochs'],
        'batch_size': config['batch_size']
    }
    agent = PPOAgent(agent_config)

    # Logger
    logger = TrainingLogger(log_dir=config['log_dir'])

    # Training loop
    n_episodes = config['n_episodes']
    n_steps = config['n_steps']

    print("νμ΅ μμ...\n")

    for episode in range(n_episodes):
        # Collect one rollout
        rollout = agent.collect_rollout(env, n_steps)

        # Rollout statistics
        episode_reward = rollout['rewards'].sum()
        # BUGFIX: the previous formula mixed normalized observation entries
        # with raw currency amounts (shares/100 * 100 used as a value, and
        # cash_ratio scaled by the *initial* balance), yielding a meaningless
        # number. Read the portfolio value directly from the environment.
        # NOTE(review): if an episode ended mid-rollout the env was reset, so
        # this reflects the env's state at the end of the rollout.
        final_value = env.balance + env.shares_held * env.prices[env.current_step]
        portfolio_return = (final_value - config['initial_balance']) / config['initial_balance'] * 100

        # PPO update
        loss = agent.update(rollout)

        # Logging
        logger.log(episode, episode_reward, loss, portfolio_return)

        if (episode + 1) % config['log_interval'] == 0:
            avg_reward = np.mean(logger.metrics['rewards'][-config['log_interval']:])
            avg_return = np.mean(logger.metrics['returns'][-config['log_interval']:])
            print(f"Episode {episode + 1}/{n_episodes} | "
                  f"Avg Reward: {avg_reward:.2f} | "
                  f"Avg Return: {avg_return:.2f}% | "
                  f"Loss: {loss:.4f}")

        # Periodic checkpoint
        if (episode + 1) % config['save_interval'] == 0:
            save_path = os.path.join(config['checkpoint_dir'], f'agent_ep{episode + 1}.pt')
            agent.save(save_path)
            print(f" 체ν¬ν¬μΈνΈ μ μ₯: {save_path}")

    # Final model
    final_path = os.path.join(config['checkpoint_dir'], 'agent_final.pt')
    agent.save(final_path)

    # Save and visualize the logs
    logger.save()
    logger.plot()

    print("\nνμ΅ μλ£!")
    return agent, logger
472
473
# =============================================================================
# 5. Evaluation
# =============================================================================
477
def evaluate_agent(agent, n_episodes=10, render=False):
    """Evaluate a trained agent over several fresh episodes.

    Actions are sampled from the policy exactly as during training; `render`
    is accepted for interface compatibility but currently unused. Prints
    per-episode and aggregate statistics and returns the list of per-episode
    portfolio returns (in percent).
    """
    env = SimpleTradingEnv()

    def run_one_episode():
        # Roll a single episode to completion; return its portfolio return (%).
        observation = env.reset()
        finished = False
        while not finished:
            with torch.no_grad():
                state = torch.FloatTensor(observation).unsqueeze(0)
                chosen_action, _, _, _ = agent.network.get_action_and_value(state)
            observation, _, finished, _ = env.step(chosen_action.item())

        end_value = env.balance + env.shares_held * env.prices[env.current_step]
        return (end_value - env.initial_balance) / env.initial_balance * 100

    print("\n=== μμ΄μ νΈ νκ° ===\n")

    episode_returns = []
    for episode in range(n_episodes):
        pct_return = run_one_episode()
        episode_returns.append(pct_return)
        print(f"Episode {episode + 1}: Return = {pct_return:.2f}%")

    mean_return = np.mean(episode_returns)
    std_return = np.std(episode_returns)
    print(f"\nνκ· μμ΅λ₯ : {mean_return:.2f}% Β± {std_return:.2f}%")

    return episode_returns
512
513
# =============================================================================
# 6. Main
# =============================================================================
517
if __name__ == "__main__":
    # Project configuration.
    # NOTE: the user-facing strings in this block were mojibake-corrupted
    # (broken multibyte encoding); they have been restored as English.
    config = {
        # Environment
        'initial_balance': 10000,
        'max_steps': 100,

        # Network
        'hidden_dim': 128,

        # Optimization
        'lr': 3e-4,
        'gamma': 0.99,
        'gae_lambda': 0.95,
        'clip_epsilon': 0.2,
        'n_epochs': 10,
        'batch_size': 64,

        # Training schedule
        'n_episodes': 500,
        'n_steps': 100,
        'log_interval': 50,
        'save_interval': 100,

        # Directories
        'log_dir': 'logs',
        'checkpoint_dir': 'checkpoints'
    }

    # Create output directories
    os.makedirs(config['log_dir'], exist_ok=True)
    os.makedirs(config['checkpoint_dir'], exist_ok=True)

    print("=" * 60)
    print("Practical RL project: trading agent")
    print("=" * 60)

    # Print the configuration
    print("\nConfig:")
    for key, value in config.items():
        print(f"  {key}: {value}")

    # Train
    agent, logger = train_agent(config)

    # Evaluate
    returns = evaluate_agent(agent, n_episodes=20)

    print("\nProject complete!")
    print("\nGenerated files:")
    print(f"  - {config['log_dir']}/training_log.json")
    print(f"  - {config['log_dir']}/training_progress.png")
    print(f"  - {config['checkpoint_dir']}/agent_final.pt")

    print("\nKey takeaways:")
    print("  1. Custom environment implementation (Gymnasium style)")
    print("  2. Modular PPO agent")
    print("  3. Training pipeline (collect / update / log)")
    print("  4. Model save/load")
    print("  5. Evaluation and visualization")
    print("  6. Project structure best practices")