08. 정책 경사 (Policy Gradient)
08. 정책 경사 (Policy Gradient)¶
난이도: ⭐⭐⭐⭐ (고급)
ํ์ต ๋ชฉํ¶
- ์ ์ฑ ๊ธฐ๋ฐ ๋ฐฉ๋ฒ์ ์ฅ๋จ์ ์ดํด
- ์ ์ฑ ๊ฒฝ์ฌ ์ ๋ฆฌ (Policy Gradient Theorem) ์ ๋
- REINFORCE ์๊ณ ๋ฆฌ์ฆ ๊ตฌํ
- Baseline์ ํตํ ๋ถ์ฐ ๊ฐ์ ๊ธฐ๋ฒ
- Actor-Critic์ผ๋ก์ ์ฐ๊ฒฐ
1. ๊ฐ์น ๊ธฐ๋ฐ vs ์ ์ฑ ๊ธฐ๋ฐ¶
1.1 ๋น๊ต¶
| ํน์ฑ | ๊ฐ์น ๊ธฐ๋ฐ (DQN) | ์ ์ฑ ๊ธฐ๋ฐ |
|---|---|---|
| ํ์ต ๋์ | Q(s, a) | ฯ(a|s) |
| ์ ์ฑ ๋์ถ | Q์์ ๊ฐ์ ์ ๋ | ์ง์ ํ์ต |
| ํ๋ ๊ณต๊ฐ | ์ด์ฐ (์ฃผ๋ก) | ์ด์ฐ + ์ฐ์ |
| ํ๋ฅ ์ ์ ์ฑ | ์ด๋ ค์ | ์์ฐ์ค๋ฌ์ |
| ์๋ ด | ๋ถ์์ ๊ฐ๋ฅ | ์ง์ญ ์ต์ |
1.2 ์ ์ฑ ๊ธฐ๋ฐ์ ์ฅ์ ¶
1. ์ฐ์ ํ๋ ๊ณต๊ฐ ์ฒ๋ฆฌ ๊ฐ๋ฅ (๋ก๋ด ์ ์ด)
2. ํ๋ฅ ์ ์ ์ฑ
ํ์ต ๊ฐ๋ฅ (๊ฐ์๋ฐ์๋ณด)
3. ์ ์ฑ
๊ณต๊ฐ์ด ๋ ๋จ์ํ ์ ์์
4. ๋ ๋์ ์๋ ด ๋ณด์ฅ (์ผ๋ถ ๊ฒฝ์ฐ)
2. ์ ์ฑ ์ ํ๋ผ๋ฏธํฐํ¶
2.1 ์ํํธ๋งฅ์ค ์ ์ฑ (์ด์ฐ ํ๋)¶
import torch
import torch.nn as nn
import torch.nn.functional as F
class DiscretePolicy(nn.Module):
    """Categorical (softmax) policy network for discrete action spaces."""

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Two hidden ReLU layers with a logits head on top.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, state):
        """Return action probabilities pi(a|s) for the given state batch."""
        return F.softmax(self.network(state), dim=-1)

    def get_action(self, state):
        """Sample one action; return (action_index, log_probability)."""
        dist = torch.distributions.Categorical(self.forward(state))
        chosen = dist.sample()
        return chosen.item(), dist.log_prob(chosen)
2.2 ๊ฐ์ฐ์์ ์ ์ฑ (์ฐ์ ํ๋)¶
class GaussianPolicy(nn.Module):
    """Diagonal-Gaussian policy for continuous action spaces.

    The mean is state-dependent; the log-std is a free, state-independent
    learnable parameter initialised to zero (i.e. std = 1).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.mean_layer = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        """Return (mean, std) of the Gaussian over actions."""
        hidden = self.shared(state)
        return self.mean_layer(hidden), self.log_std.exp()

    def get_action(self, state):
        """Sample an action; return (action_tensor, joint log-probability)."""
        mu, sigma = self.forward(state)
        dist = torch.distributions.Normal(mu, sigma)
        sampled = dist.sample()
        # Sum per-dimension log-probs -> log-prob of the full action vector.
        return sampled, dist.log_prob(sampled).sum(-1)
3. ์ ์ฑ ๊ฒฝ์ฌ ์ ๋ฆฌ¶
3.1 ๋ชฉํ ํจ์¶
์ ์ฑ ฯ_ฮธ์ ์ฑ๋ฅ์ ์ต๋ํ:
$$J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} [R(\tau)]$$
여기서 τ = (s₀, a₀, r₀, s₁, a₁, r₁, ...) 는 궤적(trajectory)
3.2 ์ ์ฑ ๊ฒฝ์ฌ ์ ๋ฆฌ¶
$$\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^{T} \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot G_t \right]$$
์ง๊ด์ ํด์: - ์ข์ ๊ฒฐ๊ณผ(๋์ G_t)๋ฅผ ๊ฐ์ ธ์จ ํ๋์ ํ๋ฅ ์ ๋์ - ๋์ ๊ฒฐ๊ณผ๋ฅผ ๊ฐ์ ธ์จ ํ๋์ ํ๋ฅ ์ ๋ฎ์ถค
3.3 ์ ๋ (Log-derivative trick)¶
∇_θ π(a|s;θ) = π(a|s;θ) · ∇_θ log π(a|s;θ)
따라서:
∇_θ J(θ) = E[R · ∇_θ log π(a|s;θ)]
         = E[∇_θ log π(a|s;θ) · R]
4. REINFORCE ์๊ณ ๋ฆฌ์ฆ¶
4.1 ๊ธฐ๋ณธ REINFORCE¶
๋ชฌํ ์นด๋ฅผ๋ก ์ ์ฑ ๊ฒฝ์ฌ ๋ฐฉ๋ฒ์ ๋๋ค.
class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete actions.

    Collects one full episode (log-probs + rewards), then performs a single
    gradient step on  -sum_t log pi(a_t|s_t) * G_t  with normalized returns.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99):
        self.policy = DiscretePolicy(state_dim, action_dim)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        # Per-episode storage, cleared after each update().
        self.log_probs = []
        self.rewards = []

    def choose_action(self, state):
        """Sample an action from the current policy and cache its log-prob."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action, log_prob = self.policy.get_action(state_tensor)
        self.log_probs.append(log_prob)
        return action

    def store_reward(self, reward):
        """Record the reward received after the most recent action."""
        self.rewards.append(reward)

    def compute_returns(self):
        """Compute discounted returns G_t for the stored episode.

        Returns are normalized to zero mean / unit std (optional but
        recommended for variance reduction).  Normalization is skipped for
        single-step episodes, where the sample std is undefined (NaN).
        """
        returns = []
        G = 0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns)
        if len(returns) > 1:  # std() of a single element is NaN
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def update(self):
        """One policy-gradient step over the stored episode; returns the loss."""
        returns = self.compute_returns()
        # Negated log-prob * return: gradient *ascent* on expected return.
        policy_loss = [-lp * G for lp, G in zip(self.log_probs, returns)]
        loss = torch.stack(policy_loss).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Reset episode buffers.
        self.log_probs = []
        self.rewards = []
        return loss.item()
4.2 ํ์ต ๋ฃจํ¶
import gymnasium as gym
import numpy as np
def train_reinforce(env_name='CartPole-v1', n_episodes=1000):
    """Train a REINFORCE agent on a gymnasium environment.

    Returns the trained agent and the per-episode score history.
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = REINFORCE(state_dim, action_dim, lr=1e-3)

    scores = []
    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        # Roll out one full episode under the current policy.
        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.store_reward(reward)
            state = next_state
            total_reward += reward
        # Monte-Carlo method: update only after the episode finishes.
        loss = agent.update()
        scores.append(total_reward)
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}, Avg Score: {np.mean(scores[-100:]):.2f}")
    return agent, scores
5. Baseline์ ํตํ ๋ถ์ฐ ๊ฐ์¶
5.1 ๋ถ์ฐ ๋ฌธ์ ¶
REINFORCE์ ๊ทธ๋๋์ธํธ๋ ๋์ ๋ถ์ฐ์ ๊ฐ์ง๋๋ค.
Var(∇_θ J) ∝ E[(G - b)²]
5.2 Baseline ๋์ ¶
์์ b๋ฅผ ๋นผ๋ ๊ธฐ๋๊ฐ์ ๋ณํ์ง ์์ง๋ง ๋ถ์ฐ์ ๊ฐ์ํฉ๋๋ค.
$$\nabla_\theta J(\theta) = \mathbb{E} \left[ \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot (G_t - b) \right]$$
๊ฐ์ฅ ์ข์ baseline: b = V(s)
class REINFORCEWithBaseline:
    """REINFORCE with a learned state-value baseline V(s).

    The policy gradient uses the advantage  A_t = G_t - V(s_t)  instead of the
    raw return, which lowers variance without biasing the gradient.  A separate
    value network is regressed onto the Monte-Carlo returns.

    Fix: the original class called ``self.compute_returns()`` but never defined
    it, and offered no way to record rewards; ``compute_returns`` and
    ``store_reward`` are added here.
    """

    def __init__(self, state_dim, action_dim, lr_policy=1e-3, lr_value=1e-3, gamma=0.99):
        self.policy = DiscretePolicy(state_dim, action_dim)
        # Baseline: small MLP state-value estimator V(s).
        self.value = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy)
        self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value)
        self.gamma = gamma
        # Per-episode buffers, cleared after update().
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.states = []

    def choose_action(self, state):
        """Sample an action; cache its log-prob and the state's value estimate."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        # Sample from the policy.
        action, log_prob = self.policy.get_action(state_tensor)
        # Predict the baseline V(s).
        value = self.value(state_tensor)
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.states.append(state_tensor)
        return action

    def store_reward(self, reward):
        """Record the reward received after the most recent action."""
        self.rewards.append(reward)

    def compute_returns(self):
        """Discounted Monte-Carlo returns G_t.

        Kept unnormalized so the value network regresses onto the true
        return scale (the baseline subtraction handles variance reduction).
        """
        returns = []
        G = 0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns, dtype=torch.float32)

    def update(self):
        """One gradient step each for the policy and the value baseline."""
        returns = self.compute_returns()
        values = torch.cat(self.values).squeeze()
        log_probs = torch.stack(self.log_probs)

        # Advantage = Return - Baseline (value); detached so the policy loss
        # does not backpropagate into the value network.
        advantages = returns - values.detach()

        # Policy loss (gradient ascent on expected return).
        policy_loss = -(log_probs * advantages).mean()
        # Value loss: regression of V(s_t) onto G_t.
        value_loss = F.mse_loss(values, returns)

        # Policy update.
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Value update.
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Reset buffers.
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.states = []
        return policy_loss.item(), value_loss.item()
6. ์ฐ์ ํ๋ ๊ณต๊ฐ ์์ ¶
6.1 ์ฐ์ ํ๋ REINFORCE¶
class ContinuousREINFORCE:
    """REINFORCE agent with a Gaussian policy for continuous action spaces.

    Fix: the original class called ``self.compute_returns()`` in ``update``
    without defining it; it is added here (mirroring the discrete REINFORCE
    agent), and ``update`` now returns the loss for logging consistency.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99):
        self.policy = GaussianPolicy(state_dim, action_dim)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        # Per-episode buffers.
        self.log_probs = []
        self.rewards = []

    def choose_action(self, state):
        """Sample a continuous action; return it as a flat numpy array."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action, log_prob = self.policy.get_action(state_tensor)
        self.log_probs.append(log_prob)
        return action.detach().numpy().squeeze()

    def compute_returns(self):
        """Discounted returns, normalized for variance reduction.

        Normalization is skipped for single-step episodes, where the sample
        std is undefined (NaN).
        """
        returns = []
        G = 0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def update(self):
        """One policy-gradient step over the stored episode; returns the loss."""
        returns = self.compute_returns()
        policy_loss = [-lp * G for lp, G in zip(self.log_probs, returns)]
        loss = torch.stack(policy_loss).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.log_probs = []
        self.rewards = []
        return loss.item()
6.2 MountainCarContinuous ์์ ¶
def train_continuous():
    """Train a Gaussian-policy REINFORCE agent on MountainCarContinuous-v0."""
    env = gym.make('MountainCarContinuous-v0')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = ContinuousREINFORCE(state_dim, action_dim, lr=1e-3)

    for episode in range(500):
        state, _ = env.reset()
        total_reward = 0
        while True:
            raw_action = agent.choose_action(state)
            # Clip to the environment's legal action range before stepping.
            action = np.clip(raw_action, env.action_space.low, env.action_space.high)
            next_state, reward, done, truncated, _ = env.step(action)
            agent.rewards.append(reward)
            state = next_state
            total_reward += reward
            if done or truncated:
                break
        agent.update()
        print(f"Episode {episode + 1}, Reward: {total_reward:.2f}")
7. ๊ณ ๊ธ ๊ธฐ๋ฒ¶
7.1 ์ํธ๋กํผ ์ ๊ทํ¶
ํ์์ ์ฅ๋ คํ๊ธฐ ์ํด ์ ์ฑ ์ ์ํธ๋กํผ๋ฅผ ์์ค์ ์ถ๊ฐํฉ๋๋ค.
def compute_entropy(probs, eps=1e-12):
    """Mean entropy of a batch of categorical distributions.

    Args:
        probs: tensor of action probabilities, shape (..., n_actions).
        eps: floor applied before the log; without it a zero probability
            yields 0 * log(0) = NaN (the original implementation's bug).

    Returns:
        Scalar tensor: mean over the batch of -sum_a p(a) * log p(a).
    """
    log_probs = probs.clamp(min=eps).log()
    return -(probs * log_probs).sum(dim=-1).mean()
# ์์ค ํจ์
total_loss = policy_loss - entropy_coef * entropy
7.2 Reward Shaping¶
ํฌ์ ๋ณด์ ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๊ธฐ ์ํ ๋ณด์ ๋ณํ:
def shape_reward(reward, state, next_state, done):
    """Reward-shaping example: augment the raw reward with extra signals."""
    # Encourage movement along the first state dimension.
    movement = abs(next_state[0] - state[0])
    # One-off bonus when the episode ends with a positive (success) reward.
    bonus = 100 if (done and reward > 0) else 0
    return reward + 0.1 * movement + bonus
8. REINFORCE์ ํ๊ณ¶
8.1 ๋ฌธ์ ์ ¶
- ๋์ ๋ถ์ฐ: ์ํผ์๋ ์ ์ฒด๋ฅผ ์ฌ์ฉํ๋ฏ๋ก ๋ถ์ฐ์ด ํผ
- ์ํ ๋นํจ์จ: ์ํผ์๋ ์ข ๋ฃ๊น์ง ๊ธฐ๋ค๋ ค์ผ ํจ
- ํฌ๋ ๋ง ํ ๋น: ์ด๋ค ํ๋์ด ์ข์ ๊ฒฐ๊ณผ๋ฅผ ๊ฐ์ ธ์๋์ง ํ์ ์ด๋ ค์
8.2 ํด๊ฒฐ์ฑ โ Actor-Critic¶
- TD ํ์ต๊ณผ ์ ์ฑ ๊ฒฝ์ฌ์ ๊ฒฐํฉ
- ๋ถํธ์คํธ๋ํ์ผ๋ก ๋ถ์ฐ ๊ฐ์
- ์คํ ๋ง๋ค ์ ๋ฐ์ดํธ ๊ฐ๋ฅ
์์ฝ¶
| ์๊ณ ๋ฆฌ์ฆ | ์ ๋ฐ์ดํธ ์์ | Baseline | ํน์ง |
|---|---|---|---|
| REINFORCE | ์ํผ์๋ ์ข ๋ฃ | ์์ | ๋จ์, ๋์ ๋ถ์ฐ |
| REINFORCE + Baseline | ์ํผ์๋ ์ข ๋ฃ | V(s) | ๋ฎ์ ๋ถ์ฐ |
| Actor-Critic | ๋งค ์คํ | V(s) ๋๋ Q(s,a) | ํจ์จ์ |
ํต์ฌ ๊ณต์:
∇_θ J(θ) = E[∇_θ log π_θ(a|s) · (G - b)]
๋ค์ ๋จ๊ณ¶
- 09_Actor_Critic.md - Actor-Critic ๋ฐฉ๋ฒ๋ก