06. Q-Learning and SARSA
06. Q-Learning and SARSA¶
Difficulty: ⭐⭐⭐ (Intermediate)
Learning Objectives¶
- Understand Q-Learning principles and off-policy characteristics
- Understand SARSA principles and on-policy characteristics
- Compare differences between Q-Learning and SARSA
- Implement epsilon-greedy exploration strategy
- Learn convergence conditions and practical tips
1. Action-Value Function (Q Function)¶
1.1 Definition of Q Function¶
A function that evaluates the value of state-action pairs.
$$Q(s, a) = \mathbb{E}[G_t | S_t = s, A_t = a]$$
import numpy as np
class QTable:
def __init__(self, n_states, n_actions):
# Initialize Q table (0 or small random values)
self.q_table = np.zeros((n_states, n_actions))
def get_q(self, state, action):
return self.q_table[state, action]
def get_best_action(self, state):
return np.argmax(self.q_table[state])
def update(self, state, action, value):
self.q_table[state, action] = value
1.2 Relationship between V and Q¶
V(s) = max_a Q(s, a) # For optimal policy
V(s) = Σ_a π(a|s) Q(s, a) # For general policy
2. Q-Learning (Off-Policy TD)¶
2.1 Q-Learning Algorithm¶
Off-Policy: Behavior policy and target policy are different
Q(s, a) ← Q(s, a) + α[r + γ max_a' Q(s', a') - Q(s, a)]
- Behavior policy: ε-greedy (for exploration)
- Target policy: greedy (for learning)
class QLearning:
def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
self.q_table = np.zeros((n_states, n_actions))
self.alpha = alpha # Learning rate
self.gamma = gamma # Discount factor
self.epsilon = epsilon # Exploration rate
self.n_actions = n_actions
def choose_action(self, state):
"""ε-greedy action selection"""
if np.random.random() < self.epsilon:
return np.random.randint(self.n_actions) # Explore
return np.argmax(self.q_table[state]) # Exploit
def update(self, state, action, reward, next_state, done):
"""Q-Learning update"""
if done:
target = reward
else:
# Off-policy: Use max Q value for next state
target = reward + self.gamma * np.max(self.q_table[next_state])
# TD update
td_error = target - self.q_table[state, action]
self.q_table[state, action] += self.alpha * td_error
return td_error
2.2 Q-Learning Training Loop¶
def train_qlearning(env, agent, n_episodes=1000):
rewards_history = []
for episode in range(n_episodes):
state = env.reset()
total_reward = 0
done = False
while not done:
# Choose action with ε-greedy
action = agent.choose_action(state)
# Step in environment
next_state, reward, done, _ = env.step(action)
# Update Q table (independent of next action)
agent.update(state, action, reward, next_state, done)
state = next_state
total_reward += reward
rewards_history.append(total_reward)
# Epsilon decay (optional)
agent.epsilon = max(0.01, agent.epsilon * 0.995)
return rewards_history
3. SARSA (On-Policy TD)¶
3.1 SARSA Algorithm¶
On-Policy: Behavior policy and target policy are the same
Name origin: State, Action, Reward, State, Action
Q(s, a) ← Q(s, a) + α[r + γ Q(s', a') - Q(s, a)]
Where a' is the next action actually selected.
class SARSA:
def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
self.q_table = np.zeros((n_states, n_actions))
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
self.n_actions = n_actions
def choose_action(self, state):
"""ε-greedy policy"""
if np.random.random() < self.epsilon:
return np.random.randint(self.n_actions)
return np.argmax(self.q_table[state])
def update(self, state, action, reward, next_state, next_action, done):
"""SARSA update"""
if done:
target = reward
else:
# On-policy: Use Q value of actual next action
target = reward + self.gamma * self.q_table[next_state, next_action]
td_error = target - self.q_table[state, action]
self.q_table[state, action] += self.alpha * td_error
return td_error
3.2 SARSA Training Loop¶
def train_sarsa(env, agent, n_episodes=1000):
rewards_history = []
for episode in range(n_episodes):
state = env.reset()
action = agent.choose_action(state) # Select initial action
total_reward = 0
done = False
while not done:
# Step in environment
next_state, reward, done, _ = env.step(action)
# Select next action (before update)
next_action = agent.choose_action(next_state)
# SARSA update (needs next action)
agent.update(state, action, reward, next_state, next_action, done)
state = next_state
action = next_action
total_reward += reward
rewards_history.append(total_reward)
return rewards_history
4. Q-Learning vs SARSA Comparison¶
4.1 Key Differences¶
| Feature | Q-Learning | SARSA |
|---|---|---|
| Policy Type | Off-policy | On-policy |
| Target Calculation | max Q(s', a') | Q(s', a') |
| Learning Target | Optimal policy | Current policy |
| Exploration Impact | No impact on learning | Direct impact on learning |
| Safety | More aggressive | More conservative |
4.2 Cliff Walking Example¶
[S][.][.][.][.][.][.][.][.][.][.][G]
[C][C][C][C][C][C][C][C][C][C][C][C]
S: Start, G: Goal, C: Cliff (large negative reward)
def cliff_walking_comparison():
"""
Q-Learning: Prefers shortest path along cliff edge (risky but fast)
SARSA: Prefers safe path away from cliff (slow but safe)
"""
# Q-Learning learns optimal path but
# may fall off cliff during ε-greedy exploration
# SARSA considers exploration and
# learns path away from cliff
pass
4.3 Visualization Comparison¶
import matplotlib.pyplot as plt
def compare_algorithms(env, n_episodes=500, n_runs=10):
q_rewards = np.zeros((n_runs, n_episodes))
sarsa_rewards = np.zeros((n_runs, n_episodes))
for run in range(n_runs):
q_agent = QLearning(env.n_states, env.n_actions)
sarsa_agent = SARSA(env.n_states, env.n_actions)
q_rewards[run] = train_qlearning(env, q_agent, n_episodes)
sarsa_rewards[run] = train_sarsa(env, sarsa_agent, n_episodes)
# Mean and standard deviation
plt.figure(figsize=(10, 6))
q_mean = q_rewards.mean(axis=0)
sarsa_mean = sarsa_rewards.mean(axis=0)
plt.plot(q_mean, label='Q-Learning', alpha=0.8)
plt.plot(sarsa_mean, label='SARSA', alpha=0.8)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Q-Learning vs SARSA')
plt.legend()
plt.show()
5. Exploration Strategies¶
5.1 Epsilon-Greedy¶
def epsilon_greedy(q_values, epsilon):
if np.random.random() < epsilon:
return np.random.randint(len(q_values))
return np.argmax(q_values)
# Epsilon decay schedule
def get_epsilon(episode, min_eps=0.01, max_eps=1.0, decay=0.995):
return max(min_eps, max_eps * (decay ** episode))
5.2 Softmax (Boltzmann) Exploration¶
def softmax_action(q_values, temperature=1.0):
"""Probabilistic action selection by temperature"""
exp_q = np.exp(q_values / temperature)
probs = exp_q / np.sum(exp_q)
return np.random.choice(len(q_values), p=probs)
5.3 UCB (Upper Confidence Bound)¶
class UCBAgent:
def __init__(self, n_states, n_actions, c=2.0):
self.q_table = np.zeros((n_states, n_actions))
self.n_visits = np.zeros((n_states, n_actions))
self.total_visits = np.zeros(n_states)
self.c = c
def choose_action(self, state):
self.total_visits[state] += 1
# Select unvisited actions first
if 0 in self.n_visits[state]:
return np.argmin(self.n_visits[state])
# Calculate UCB values
ucb_values = self.q_table[state] + self.c * np.sqrt(
np.log(self.total_visits[state]) / self.n_visits[state]
)
return np.argmax(ucb_values)
6. Expected SARSA¶
6.1 Concept¶
A middle form between SARSA and Q-Learning, using expected value for next state.
Q(s, a) ← Q(s, a) + α[r + γ Σ_a' π(a'|s') Q(s', a') - Q(s, a)]
class ExpectedSARSA:
def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
self.q_table = np.zeros((n_states, n_actions))
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
self.n_actions = n_actions
def get_policy_probs(self, state):
"""Probability distribution of ε-greedy policy"""
probs = np.ones(self.n_actions) * self.epsilon / self.n_actions
best_action = np.argmax(self.q_table[state])
probs[best_action] += 1 - self.epsilon
return probs
def update(self, state, action, reward, next_state, done):
if done:
target = reward
else:
# Calculate expected value
probs = self.get_policy_probs(next_state)
expected_q = np.sum(probs * self.q_table[next_state])
target = reward + self.gamma * expected_q
td_error = target - self.q_table[state, action]
self.q_table[state, action] += self.alpha * td_error
7. Convergence and Hyperparameters¶
7.1 Convergence Conditions¶
Conditions for Q-Learning to converge to optimal Q*:
- Visit all state-action pairs infinitely
- Learning rate conditions: Σ α = ∞, Σ α² < ∞ (e.g., α = 1/n)
- Bounded rewards
7.2 Hyperparameter Tuning¶
# General starting point
config = {
'alpha': 0.1, # Learning rate: 0.01 ~ 0.5
'gamma': 0.99, # Discount factor: 0.9 ~ 0.999
'epsilon': 1.0, # Initial exploration rate
'epsilon_min': 0.01, # Minimum exploration rate
'epsilon_decay': 0.995 # Decay rate
}
# Learning rate scheduling
def learning_rate_schedule(episode, initial_lr=0.5, decay=0.001):
return initial_lr / (1 + decay * episode)
8. Practice: FrozenLake¶
import gymnasium as gym
def train_frozen_lake():
env = gym.make('FrozenLake-v1', is_slippery=True)
agent = QLearning(
n_states=env.observation_space.n,
n_actions=env.action_space.n,
alpha=0.1,
gamma=0.99,
epsilon=1.0
)
n_episodes = 10000
rewards = []
for episode in range(n_episodes):
state, _ = env.reset()
total_reward = 0
done = False
while not done:
action = agent.choose_action(state)
next_state, reward, terminated, truncated, _ = env.step(action)
done = terminated or truncated
agent.update(state, action, reward, next_state, done)
state = next_state
total_reward += reward
rewards.append(total_reward)
agent.epsilon = max(0.01, agent.epsilon * 0.9995)
if (episode + 1) % 1000 == 0:
avg_reward = np.mean(rewards[-100:])
print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.3f}")
return agent, rewards
if __name__ == "__main__":
agent, rewards = train_frozen_lake()
Summary¶
| Algorithm | Target | Policy | Characteristics |
|---|---|---|---|
| Q-Learning | max Q(s',a') | Off-policy | Learn optimal policy |
| SARSA | Q(s',a') | On-policy | Evaluate current policy |
| Expected SARSA | E[Q(s',a')] | Off-policy | Lower variance |
Key Points: - Q-Learning directly learns optimal policy - SARSA learns safely considering exploration - Proper exploration-exploitation balance is crucial
Next Steps¶
- 07_Deep_Q_Network.md - Combining neural networks with Q-Learning
References¶
- Sutton & Barto, "Reinforcement Learning: An Introduction", Chapter 6
- David Silver's RL Course, Lecture 5: Model-Free Control
- Gymnasium FrozenLake