06. Q-Learning and SARSA

06. Q-Learning and SARSA

Difficulty: ⭐⭐⭐ (Intermediate)

Learning Objectives

  • Understand Q-Learning principles and off-policy characteristics
  • Understand SARSA principles and on-policy characteristics
  • Compare differences between Q-Learning and SARSA
  • Implement epsilon-greedy exploration strategy
  • Learn convergence conditions and practical tips

1. Action-Value Function (Q Function)

1.1 Definition of Q Function

A function that evaluates the value of state-action pairs.

$$Q(s, a) = \mathbb{E}[G_t | S_t = s, A_t = a]$$

import numpy as np

class QTable:
    def __init__(self, n_states, n_actions):
        # Initialize Q table (0 or small random values)
        self.q_table = np.zeros((n_states, n_actions))

    def get_q(self, state, action):
        return self.q_table[state, action]

    def get_best_action(self, state):
        return np.argmax(self.q_table[state])

    def update(self, state, action, value):
        self.q_table[state, action] = value

1.2 Relationship between V and Q

V(s) = max_a Q(s, a)           # For optimal policy
V(s) = Σ_a π(a|s) Q(s, a)      # For general policy

2. Q-Learning (Off-Policy TD)

2.1 Q-Learning Algorithm

Off-Policy: Behavior policy and target policy are different

Q(s, a) ← Q(s, a) + α[r + γ max_a' Q(s', a') - Q(s, a)]
  • Behavior policy: ε-greedy (for exploration)
  • Target policy: greedy (for learning)
class QLearning:
    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.n_actions = n_actions

    def choose_action(self, state):
        """ε-greedy action selection"""
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)  # Explore
        return np.argmax(self.q_table[state])          # Exploit

    def update(self, state, action, reward, next_state, done):
        """Q-Learning update"""
        if done:
            target = reward
        else:
            # Off-policy: Use max Q value for next state
            target = reward + self.gamma * np.max(self.q_table[next_state])

        # TD update
        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error

        return td_error

2.2 Q-Learning Training Loop

def train_qlearning(env, agent, n_episodes=1000):
    rewards_history = []

    for episode in range(n_episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            # Choose action with ε-greedy
            action = agent.choose_action(state)

            # Step in environment
            next_state, reward, done, _ = env.step(action)

            # Update Q table (independent of next action)
            agent.update(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward

        rewards_history.append(total_reward)

        # Epsilon decay (optional)
        agent.epsilon = max(0.01, agent.epsilon * 0.995)

    return rewards_history

3. SARSA (On-Policy TD)

3.1 SARSA Algorithm

On-Policy: Behavior policy and target policy are the same

Name origin: State, Action, Reward, State, Action

Q(s, a) ← Q(s, a) + α[r + γ Q(s', a') - Q(s, a)]

Where a' is the next action actually selected.

class SARSA:
    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_actions = n_actions

    def choose_action(self, state):
        """ε-greedy policy"""
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, next_action, done):
        """SARSA update"""
        if done:
            target = reward
        else:
            # On-policy: Use Q value of actual next action
            target = reward + self.gamma * self.q_table[next_state, next_action]

        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error

        return td_error

3.2 SARSA Training Loop

def train_sarsa(env, agent, n_episodes=1000):
    rewards_history = []

    for episode in range(n_episodes):
        state = env.reset()
        action = agent.choose_action(state)  # Select initial action
        total_reward = 0
        done = False

        while not done:
            # Step in environment
            next_state, reward, done, _ = env.step(action)

            # Select next action (before update)
            next_action = agent.choose_action(next_state)

            # SARSA update (needs next action)
            agent.update(state, action, reward, next_state, next_action, done)

            state = next_state
            action = next_action
            total_reward += reward

        rewards_history.append(total_reward)

    return rewards_history

4. Q-Learning vs SARSA Comparison

4.1 Key Differences

Feature Q-Learning SARSA
Policy Type Off-policy On-policy
Target Calculation max Q(s', a') Q(s', a')
Learning Target Optimal policy Current policy
Exploration Impact No impact on learning Direct impact on learning
Safety More aggressive More conservative

4.2 Cliff Walking Example

[S][.][.][.][.][.][.][.][.][.][.][G]
[C][C][C][C][C][C][C][C][C][C][C][C]

S: Start, G: Goal, C: Cliff (large negative reward)
def cliff_walking_comparison():
    """
    Q-Learning: Prefers shortest path along cliff edge (risky but fast)
    SARSA: Prefers safe path away from cliff (slow but safe)
    """
    # Q-Learning learns optimal path but
    # may fall off cliff during ε-greedy exploration

    # SARSA considers exploration and
    # learns path away from cliff
    pass

4.3 Visualization Comparison

import matplotlib.pyplot as plt

def compare_algorithms(env, n_episodes=500, n_runs=10):
    q_rewards = np.zeros((n_runs, n_episodes))
    sarsa_rewards = np.zeros((n_runs, n_episodes))

    for run in range(n_runs):
        q_agent = QLearning(env.n_states, env.n_actions)
        sarsa_agent = SARSA(env.n_states, env.n_actions)

        q_rewards[run] = train_qlearning(env, q_agent, n_episodes)
        sarsa_rewards[run] = train_sarsa(env, sarsa_agent, n_episodes)

    # Mean and standard deviation
    plt.figure(figsize=(10, 6))

    q_mean = q_rewards.mean(axis=0)
    sarsa_mean = sarsa_rewards.mean(axis=0)

    plt.plot(q_mean, label='Q-Learning', alpha=0.8)
    plt.plot(sarsa_mean, label='SARSA', alpha=0.8)

    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Q-Learning vs SARSA')
    plt.legend()
    plt.show()

5. Exploration Strategies

5.1 Epsilon-Greedy

def epsilon_greedy(q_values, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(len(q_values))
    return np.argmax(q_values)

# Epsilon decay schedule
def get_epsilon(episode, min_eps=0.01, max_eps=1.0, decay=0.995):
    return max(min_eps, max_eps * (decay ** episode))

5.2 Softmax (Boltzmann) Exploration

def softmax_action(q_values, temperature=1.0):
    """Probabilistic action selection by temperature"""
    exp_q = np.exp(q_values / temperature)
    probs = exp_q / np.sum(exp_q)
    return np.random.choice(len(q_values), p=probs)

5.3 UCB (Upper Confidence Bound)

class UCBAgent:
    def __init__(self, n_states, n_actions, c=2.0):
        self.q_table = np.zeros((n_states, n_actions))
        self.n_visits = np.zeros((n_states, n_actions))
        self.total_visits = np.zeros(n_states)
        self.c = c

    def choose_action(self, state):
        self.total_visits[state] += 1

        # Select unvisited actions first
        if 0 in self.n_visits[state]:
            return np.argmin(self.n_visits[state])

        # Calculate UCB values
        ucb_values = self.q_table[state] + self.c * np.sqrt(
            np.log(self.total_visits[state]) / self.n_visits[state]
        )
        return np.argmax(ucb_values)

6. Expected SARSA

6.1 Concept

A middle form between SARSA and Q-Learning, using expected value for next state.

Q(s, a) ← Q(s, a) + α[r + γ Σ_a' π(a'|s') Q(s', a') - Q(s, a)]
class ExpectedSARSA:
    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_actions = n_actions

    def get_policy_probs(self, state):
        """Probability distribution of ε-greedy policy"""
        probs = np.ones(self.n_actions) * self.epsilon / self.n_actions
        best_action = np.argmax(self.q_table[state])
        probs[best_action] += 1 - self.epsilon
        return probs

    def update(self, state, action, reward, next_state, done):
        if done:
            target = reward
        else:
            # Calculate expected value
            probs = self.get_policy_probs(next_state)
            expected_q = np.sum(probs * self.q_table[next_state])
            target = reward + self.gamma * expected_q

        td_error = target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * td_error

7. Convergence and Hyperparameters

7.1 Convergence Conditions

Conditions for Q-Learning to converge to optimal Q*:

  1. Visit all state-action pairs infinitely
  2. Learning rate conditions: Σ α = ∞, Σ α² < ∞ (e.g., α = 1/n)
  3. Bounded rewards

7.2 Hyperparameter Tuning

# General starting point
config = {
    'alpha': 0.1,        # Learning rate: 0.01 ~ 0.5
    'gamma': 0.99,       # Discount factor: 0.9 ~ 0.999
    'epsilon': 1.0,      # Initial exploration rate
    'epsilon_min': 0.01, # Minimum exploration rate
    'epsilon_decay': 0.995  # Decay rate
}

# Learning rate scheduling
def learning_rate_schedule(episode, initial_lr=0.5, decay=0.001):
    return initial_lr / (1 + decay * episode)

8. Practice: FrozenLake

import gymnasium as gym

def train_frozen_lake():
    env = gym.make('FrozenLake-v1', is_slippery=True)

    agent = QLearning(
        n_states=env.observation_space.n,
        n_actions=env.action_space.n,
        alpha=0.1,
        gamma=0.99,
        epsilon=1.0
    )

    n_episodes = 10000
    rewards = []

    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            agent.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        rewards.append(total_reward)
        agent.epsilon = max(0.01, agent.epsilon * 0.9995)

        if (episode + 1) % 1000 == 0:
            avg_reward = np.mean(rewards[-100:])
            print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.3f}")

    return agent, rewards

if __name__ == "__main__":
    agent, rewards = train_frozen_lake()

Summary

Algorithm Target Policy Characteristics
Q-Learning max Q(s',a') Off-policy Learn optimal policy
SARSA Q(s',a') On-policy Evaluate current policy
Expected SARSA E[Q(s',a')] Off-policy Lower variance

Key Points: - Q-Learning directly learns optimal policy - SARSA learns safely considering exploration - Proper exploration-exploitation balance is crucial


Next Steps


References

  • Sutton & Barto, "Reinforcement Learning: An Introduction", Chapter 6
  • David Silver's RL Course, Lecture 5: Model-Free Control
  • Gymnasium FrozenLake
to navigate between lessons