06. Q-Learning and SARSA
06. Q-Learning and SARSA¶
Difficulty: ⭐⭐⭐ (Intermediate)
Learning Objectives¶
- Understand the principle of Q-Learning and its off-policy nature
- Understand the principle of SARSA and its on-policy nature
- Compare the differences between Q-Learning and SARSA
- Implement the epsilon-greedy exploration strategy
- Convergence conditions and practical application tips
1. Action-Value Function (Q Function)¶
1.1 Definition of the Q Function¶
A function that evaluates the value of a state-action pair.
$$Q(s, a) = \mathbb{E}[G_t \mid S_t = s, A_t = a]$$
import numpy as np
class QTable:
    """Tabular store of action values Q(s, a) for discrete states/actions."""

    def __init__(self, n_states, n_actions):
        # All entries start at zero (small random init is another common choice).
        self.q_table = np.zeros((n_states, n_actions))

    def get_q(self, state, action):
        """Return the stored value Q(state, action)."""
        return self.q_table[state, action]

    def get_best_action(self, state):
        """Return the greedy action for *state* (ties break to the lowest index)."""
        return np.argmax(self.q_table[state])

    def update(self, state, action, value):
        """Overwrite the stored value for the (state, action) pair."""
        self.q_table[state, action] = value
1.2 Relationship between V and Q¶
V(s) = max_a Q(s, a)          # under the optimal policy
V(s) = Σ_a π(a|s) Q(s, a)     # under a general policy π
2. Q-Learning (Off-Policy TD)¶
2.1 The Q-Learning Algorithm¶
Off-Policy: the behavior policy and the target policy are different.
Q(s, a) ← Q(s, a) + α[r + γ max_a' Q(s', a') - Q(s, a)]
- Behavior policy: ε-greedy (for exploration)
- Target policy: greedy (for learning)
class QLearning:
    """Tabular Q-Learning agent (off-policy TD control)."""

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.n_actions = n_actions

    def choose_action(self, state):
        """Select an action with the ε-greedy behavior policy."""
        exploring = np.random.random() < self.epsilon
        if exploring:
            return np.random.randint(self.n_actions)  # explore
        return np.argmax(self.q_table[state])         # exploit

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-Learning backup and return the TD error.

        Off-policy: the target bootstraps from the *greedy* value of the
        next state, regardless of which action will actually be taken.
        """
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(self.q_table[next_state])
        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error
        return td_error
2.2 Q-Learning Training Loop¶
def train_qlearning(env, agent, n_episodes=1000):
    """Run *n_episodes* of Q-Learning and return per-episode total rewards.

    The agent acts ε-greedily, updates after every transition (the backup
    is independent of the next action taken), and its epsilon is decayed
    after each episode down to a floor of 0.01.
    """
    rewards_history = []
    for _ in range(n_episodes):
        state = env.reset()
        episode_return = 0
        done = False
        while not done:
            # ε-greedy action from the behavior policy.
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            # Q-table backup (uses max over next actions, so the next
            # chosen action is irrelevant here).
            agent.update(state, action, reward, next_state, done)
            episode_return += reward
            state = next_state
        rewards_history.append(episode_return)
        # Optional epsilon decay with a 0.01 floor.
        agent.epsilon = max(0.01, agent.epsilon * 0.995)
    return rewards_history
3. SARSA (On-Policy TD)¶
3.1 The SARSA Algorithm¶
On-Policy: the behavior policy and the target policy are the same.
Origin of the name: State, Action, Reward, State, Action
Q(s, a) ← Q(s, a) + α[r + γ Q(s', a') - Q(s, a)]
Here a' is the next action that was actually taken.
class SARSA:
    """Tabular SARSA agent (on-policy TD control)."""

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.n_actions = n_actions

    def choose_action(self, state):
        """ε-greedy action selection."""
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, next_action, done):
        """Apply one SARSA backup and return the TD error.

        On-policy: the target bootstraps from the Q value of the action
        actually selected for the next state.
        """
        target = reward if done else (
            reward + self.gamma * self.q_table[next_state, next_action]
        )
        delta = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * delta
        return delta
3.2 SARSA Training Loop¶
def train_sarsa(env, agent, n_episodes=1000):
    """Run *n_episodes* of SARSA and return per-episode total rewards.

    The next action is selected *before* the update so the backup can use
    the full (S, A, R, S', A') quintuple — hence the algorithm's name.
    """
    rewards_history = []
    for _ in range(n_episodes):
        state = env.reset()
        action = agent.choose_action(state)  # pick the initial action
        episode_return = 0
        done = False
        while not done:
            next_state, reward, done, _ = env.step(action)
            # Choose the next action before updating (on-policy backup).
            next_action = agent.choose_action(next_state)
            agent.update(state, action, reward, next_state, next_action, done)
            state, action = next_state, next_action
            episode_return += reward
        rewards_history.append(episode_return)
    return rewards_history
4. Q-Learning vs SARSA Comparison¶
4.1 Key Differences¶
| Property | Q-Learning | SARSA |
|---|---|---|
| Policy type | Off-policy | On-policy |
| Target computation | max Q(s', a') | Q(s', a') |
| What is learned | Optimal policy | Current policy |
| Effect of exploration | Does not affect learning | Directly affects learning |
| Safety | More aggressive | More conservative |
4.2 Cliff Walking Example¶
[S][.][.][.][.][.][.][.][.][.][.][G]
[C][C][C][C][C][C][C][C][C][C][C][C]
S: start, G: goal, C: cliff (large negative reward)
def cliff_walking_comparison():
    """Illustrative placeholder contrasting the two algorithms on Cliff Walking.

    Q-Learning: prefers the shortest path along the cliff edge (risky but fast).
    SARSA: prefers a safer path away from the cliff (slower but safe).

    Q-Learning learns the optimal path, yet during ε-greedy exploration it can
    still fall off the cliff; SARSA accounts for exploration and learns a
    route that keeps its distance from the cliff.
    """
    return None
4.3 Visual Comparison¶
import matplotlib.pyplot as plt
def compare_algorithms(env, n_episodes=500, n_runs=10):
    """Train both agents *n_runs* times each and plot mean episode rewards."""
    q_rewards = np.zeros((n_runs, n_episodes))
    sarsa_rewards = np.zeros((n_runs, n_episodes))
    for run in range(n_runs):
        # Fresh agents per run so the runs are independent.
        q_agent = QLearning(env.n_states, env.n_actions)
        sarsa_agent = SARSA(env.n_states, env.n_actions)
        q_rewards[run] = train_qlearning(env, q_agent, n_episodes)
        sarsa_rewards[run] = train_sarsa(env, sarsa_agent, n_episodes)
    # Plot the learning curves averaged over runs.
    plt.figure(figsize=(10, 6))
    plt.plot(q_rewards.mean(axis=0), label='Q-Learning', alpha=0.8)
    plt.plot(sarsa_rewards.mean(axis=0), label='SARSA', alpha=0.8)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Q-Learning vs SARSA')
    plt.legend()
    plt.show()
5. Exploration Strategies¶
5.1 Epsilon-Greedy¶
def epsilon_greedy(q_values, epsilon):
    """Return a random action with probability *epsilon*, else the greedy one."""
    exploring = np.random.random() < epsilon
    return np.random.randint(len(q_values)) if exploring else np.argmax(q_values)
# Epsilon decay schedule
def get_epsilon(episode, min_eps=0.01, max_eps=1.0, decay=0.995):
    """Exponentially decayed epsilon, clipped from below at *min_eps*."""
    candidate = max_eps * decay ** episode
    return candidate if candidate > min_eps else min_eps
5.2 Softmax (Boltzmann) Exploration¶
def softmax_action(q_values, temperature=1.0):
    """Sample an action with Boltzmann (softmax) probabilities.

    Parameters
    ----------
    q_values : array-like
        Action values for the current state.
    temperature : float
        Higher values flatten the distribution (more exploration);
        lower values sharpen it toward the greedy action.

    Returns
    -------
    int
        Sampled action index.
    """
    q = np.asarray(q_values, dtype=float)
    # Fix: subtract the max before exponentiating. Softmax is shift-invariant,
    # and the original np.exp(q / temperature) overflows to inf (then NaN
    # probabilities) once q / temperature exceeds ~709.
    z = (q - np.max(q)) / temperature
    exp_q = np.exp(z)
    probs = exp_q / np.sum(exp_q)
    return np.random.choice(len(q), p=probs)
5.3 UCB (Upper Confidence Bound)¶
class UCBAgent:
    """Tabular agent that explores via Upper Confidence Bounds (UCB1).

    Action score: Q(s, a) + c * sqrt(ln N(s) / n(s, a)), where N(s) counts
    visits to the state and n(s, a) counts selections of the action there.
    """

    def __init__(self, n_states, n_actions, c=2.0):
        self.q_table = np.zeros((n_states, n_actions))
        # Per-(state, action) selection counts and per-state visit counts.
        self.n_visits = np.zeros((n_states, n_actions))
        self.total_visits = np.zeros(n_states)
        self.c = c  # exploration coefficient

    def choose_action(self, state):
        """Return the UCB-maximizing action, trying every action once first."""
        self.total_visits[state] += 1
        if 0 in self.n_visits[state]:
            # Some action was never tried in this state: pick the least tried.
            action = np.argmin(self.n_visits[state])
        else:
            ucb_values = self.q_table[state] + self.c * np.sqrt(
                np.log(self.total_visits[state]) / self.n_visits[state]
            )
            action = np.argmax(ucb_values)
        # Fix: record the selection. The original never incremented n_visits
        # anywhere, so the untried-action branch returned action 0 forever
        # and the UCB formula (which divides by n_visits) was unreachable.
        self.n_visits[state, action] += 1
        return action
6. Expected SARSA¶
6.1 Concept¶
An intermediate form between SARSA and Q-Learning that uses the expected value over next actions.
Q(s, a) ← Q(s, a) + α[r + γ Σ_a' π(a'|s') Q(s', a') - Q(s, a)]
class ExpectedSARSA:
    """Expected SARSA: backs up the ε-greedy expectation over next actions.

    Sits between SARSA and Q-Learning: instead of sampling the next action
    (SARSA) or maximizing over it (Q-Learning), it averages Q(s', ·) under
    the current policy's action probabilities, which lowers update variance.
    """

    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.epsilon = epsilon  # exploration rate
        self.n_actions = n_actions

    def get_policy_probs(self, state):
        """Action-probability vector of the ε-greedy policy at *state*."""
        probs = np.ones(self.n_actions) * self.epsilon / self.n_actions
        best_action = np.argmax(self.q_table[state])
        probs[best_action] += 1 - self.epsilon
        return probs

    def update(self, state, action, reward, next_state, done):
        """Apply one Expected-SARSA backup and return the TD error.

        Fix: now returns td_error for consistency with QLearning.update
        and SARSA.update (the original returned None).
        """
        if done:
            target = reward
        else:
            # Expectation of Q(next_state, ·) under the ε-greedy policy.
            probs = self.get_policy_probs(next_state)
            expected_q = np.sum(probs * self.q_table[next_state])
            target = reward + self.gamma * expected_q
        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error
        return td_error
7. Convergence and Hyperparameters¶
7.1 Convergence Conditions¶
Conditions for Q-Learning to converge to the optimal Q*:
- Every state-action pair is visited infinitely often
- Learning-rate conditions: Σ α = ∞, Σ α² < ∞ (e.g., α = 1/n)
- Rewards are bounded
7.2 Hyperparameter Tuning¶
# Typical starting values (common tuning ranges noted per entry).
config = {
    'alpha': 0.1,            # learning rate: 0.01 ~ 0.5
    'gamma': 0.99,           # discount factor: 0.9 ~ 0.999
    'epsilon': 1.0,          # initial exploration rate
    'epsilon_min': 0.01,     # minimum exploration rate
    'epsilon_decay': 0.995   # decay factor per episode
}
# Learning-rate scheduling
def learning_rate_schedule(episode, initial_lr=0.5, decay=0.001):
    """Hyperbolic decay: lr(t) = initial_lr / (1 + decay * t)."""
    denominator = 1 + decay * episode
    return initial_lr / denominator
8. Practice: FrozenLake¶
import gymnasium as gym
def train_frozen_lake():
    """Train a Q-Learning agent on slippery FrozenLake-v1.

    Returns the trained agent and the list of per-episode total rewards.
    Prints a 100-episode average reward every 1000 episodes.
    """
    env = gym.make('FrozenLake-v1', is_slippery=True)
    agent = QLearning(
        n_states=env.observation_space.n,
        n_actions=env.action_space.n,
        alpha=0.1,
        gamma=0.99,
        epsilon=1.0
    )
    n_episodes = 10000
    rewards = []
    for episode in range(n_episodes):
        state, _ = env.reset()
        episode_return = 0
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.update(state, action, reward, next_state, done)
            episode_return += reward
            state = next_state
        rewards.append(episode_return)
        # Slow decay (floor 0.01): slippery dynamics need long exploration.
        agent.epsilon = max(0.01, agent.epsilon * 0.9995)
        if (episode + 1) % 1000 == 0:
            avg_reward = np.mean(rewards[-100:])
            print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.3f}")
    return agent, rewards
# Script entry point: train the FrozenLake agent when run directly.
if __name__ == "__main__":
    agent, rewards = train_frozen_lake()
Summary¶
| Algorithm | Target | Policy | Characteristic |
|---|---|---|---|
| Q-Learning | max Q(s',a') | Off-policy | Learns the optimal policy |
| SARSA | Q(s',a') | On-policy | Evaluates the current policy |
| Expected SARSA | E[Q(s',a')] | Off-policy | Lower variance |
Key points: Q-Learning learns the optimal policy directly; SARSA learns safely by taking exploration into account; a proper exploration-exploitation balance is important.
Next Steps¶
- 07_Deep_Q_Network.md - Combining neural networks with Q-Learning