# 11_multi_agent.py
  1"""
  2Multi-Agent RL: IQLκ³Ό κ°„λ‹¨ν•œ ν˜‘λ ₯/경쟁 ν™˜κ²½
  3닀쀑 μ—μ΄μ „νŠΈ κ°•ν™”ν•™μŠ΅μ˜ κΈ°λ³Έ κ°œλ… κ΅¬ν˜„
  4"""
  5import torch
  6import torch.nn as nn
  7import torch.nn.functional as F
  8import numpy as np
  9import matplotlib.pyplot as plt
 10
 11
 12class IQLAgent:
 13    """Independent Q-Learning μ—μ΄μ „νŠΈ"""
 14
 15    def __init__(self, obs_dim, action_dim, lr=1e-3, gamma=0.99, epsilon=0.1):
 16        self.q_network = nn.Sequential(
 17            nn.Linear(obs_dim, 64),
 18            nn.ReLU(),
 19            nn.Linear(64, 64),
 20            nn.ReLU(),
 21            nn.Linear(64, action_dim)
 22        )
 23        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=lr)
 24        self.gamma = gamma
 25        self.epsilon = epsilon
 26        self.action_dim = action_dim
 27
 28    def choose_action(self, obs):
 29        """Epsilon-greedy 행동 선택"""
 30        if np.random.random() < self.epsilon:
 31            return np.random.randint(self.action_dim)
 32        with torch.no_grad():
 33            q_values = self.q_network(torch.FloatTensor(obs))
 34            return q_values.argmax().item()
 35
 36    def update(self, obs, action, reward, next_obs, done):
 37        """Q-learning μ—…λ°μ΄νŠΈ"""
 38        obs_tensor = torch.FloatTensor(obs)
 39        next_obs_tensor = torch.FloatTensor(next_obs)
 40
 41        current_q = self.q_network(obs_tensor)[action]
 42
 43        with torch.no_grad():
 44            if done:
 45                target_q = reward
 46            else:
 47                target_q = reward + self.gamma * self.q_network(next_obs_tensor).max()
 48
 49        loss = (current_q - target_q) ** 2
 50
 51        self.optimizer.zero_grad()
 52        loss.backward()
 53        self.optimizer.step()
 54
 55        return loss.item()
 56
 57
 58class SimpleGridWorld:
 59    """
 60    κ°„λ‹¨ν•œ 2-μ—μ΄μ „νŠΈ κ·Έλ¦¬λ“œ ν™˜κ²½
 61    - κ·Έλ¦¬λ“œ: 5x5
 62    - λͺ©ν‘œ: 두 μ—μ΄μ „νŠΈκ°€ 각자의 λͺ©ν‘œ 지점에 도달
 63    - ν˜‘λ ₯ μš”μ†Œ: 같은 셀에 있으면 λ³΄λ„ˆμŠ€ 보상
 64    """
 65
 66    def __init__(self, grid_size=5):
 67        self.grid_size = grid_size
 68        self.n_agents = 2
 69
 70        # 행동: 상, ν•˜, 쒌, 우, λŒ€κΈ°
 71        self.action_dim = 5
 72        self.obs_dim = 4  # (x, y, goal_x, goal_y)
 73
 74        self.reset()
 75
 76    def reset(self):
 77        """ν™˜κ²½ μ΄ˆκΈ°ν™”"""
 78        # μ—μ΄μ „νŠΈ 초기 μœ„μΉ˜ (λ¬΄μž‘μœ„)
 79        self.agent_pos = [
 80            [np.random.randint(self.grid_size), np.random.randint(self.grid_size)]
 81            for _ in range(self.n_agents)
 82        ]
 83
 84        # λͺ©ν‘œ μœ„μΉ˜ (κ³ μ •)
 85        self.goals = [
 86            [0, self.grid_size - 1],  # μ—μ΄μ „νŠΈ 0의 λͺ©ν‘œ
 87            [self.grid_size - 1, 0]   # μ—μ΄μ „νŠΈ 1의 λͺ©ν‘œ
 88        ]
 89
 90        self.steps = 0
 91        return self.get_observations()
 92
 93    def get_observations(self):
 94        """각 μ—μ΄μ „νŠΈμ˜ κ΄€μΈ‘ λ°˜ν™˜"""
 95        observations = []
 96        for i in range(self.n_agents):
 97            obs = [
 98                self.agent_pos[i][0] / self.grid_size,
 99                self.agent_pos[i][1] / self.grid_size,
100                self.goals[i][0] / self.grid_size,
101                self.goals[i][1] / self.grid_size
102            ]
103            observations.append(np.array(obs, dtype=np.float32))
104        return observations
105
106    def step(self, actions):
107        """ν™˜κ²½ μŠ€ν…"""
108        self.steps += 1
109        rewards = [0.0, 0.0]
110
111        # 각 μ—μ΄μ „νŠΈ 이동
112        for i, action in enumerate(actions):
113            x, y = self.agent_pos[i]
114
115            # 행동 적용: 상(0), ν•˜(1), 쒌(2), 우(3), λŒ€κΈ°(4)
116            if action == 0:  # 상
117                x = max(0, x - 1)
118            elif action == 1:  # ν•˜
119                x = min(self.grid_size - 1, x + 1)
120            elif action == 2:  # 쒌
121                y = max(0, y - 1)
122            elif action == 3:  # 우
123                y = min(self.grid_size - 1, y + 1)
124            # action == 4: λŒ€κΈ°
125
126            self.agent_pos[i] = [x, y]
127
128            # λͺ©ν‘œ 도달 보상
129            if self.agent_pos[i] == self.goals[i]:
130                rewards[i] += 10.0
131
132            # λ§€ μŠ€ν…λ§ˆλ‹€ μž‘μ€ νŽ˜λ„ν‹°
133            rewards[i] -= 0.01
134
135        # ν˜‘λ ₯ λ³΄λ„ˆμŠ€: 같은 셀에 있으면
136        if self.agent_pos[0] == self.agent_pos[1]:
137            rewards[0] += 1.0
138            rewards[1] += 1.0
139
140        # μ’…λ£Œ 쑰건: λ‘˜ λ‹€ λͺ©ν‘œ 도달 λ˜λŠ” μ΅œλŒ€ μŠ€ν… 도달
141        done = (
142            (self.agent_pos[0] == self.goals[0] and self.agent_pos[1] == self.goals[1])
143            or self.steps >= 50
144        )
145
146        observations = self.get_observations()
147        return observations, rewards, done
148
149
class CompetitiveGridWorld:
    """Competitive environment: two agents race for a single reward.

    The first agent to reach the shared goal cell claims the reward;
    ties within one step are resolved in agent-index order.
    """

    def __init__(self, grid_size=5):
        self.grid_size = grid_size
        self.n_agents = 2
        self.action_dim = 5
        self.obs_dim = 4

        self.reset()

    def reset(self):
        """Start a new episode with agents in opposite corners."""
        last = self.grid_size - 1
        self.agent_pos = [
            [0, 0],
            [last, last],
        ]

        # Single shared goal at the grid center; unclaimed at episode start.
        self.goal = [self.grid_size // 2, self.grid_size // 2]
        self.goal_taken = False
        self.steps = 0

        return self.get_observations()

    def get_observations(self):
        """Return each agent's normalized local observation."""
        scale = self.grid_size
        gx, gy = self.goal
        return [
            np.array([px / scale, py / scale, gx / scale, gy / scale],
                     dtype=np.float32)
            for px, py in self.agent_pos
        ]

    def step(self, actions):
        """Advance one timestep; returns (observations, rewards, done)."""
        self.steps += 1
        rewards = [0.0] * self.n_agents

        # (dx, dy) per action: up(0), down(1), left(2), right(3), stay(4).
        deltas = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1), 4: (0, 0)}
        hi = self.grid_size - 1

        for i, action in enumerate(actions):
            dx, dy = deltas[action]
            nx = min(max(self.agent_pos[i][0] + dx, 0), hi)
            ny = min(max(self.agent_pos[i][1] + dy, 0), hi)
            self.agent_pos[i] = [nx, ny]

            # Only the first agent to arrive claims the reward.
            if not self.goal_taken and self.agent_pos[i] == self.goal:
                rewards[i] += 10.0
                self.goal_taken = True

            # Small per-step penalty to encourage short paths.
            rewards[i] -= 0.01

        # Episode ends once the goal is claimed or after 50 steps.
        done = self.goal_taken or self.steps >= 50

        return self.get_observations(), rewards, done
225
226
class IQLSystem:
    """A team of independent Q-learners, one IQLAgent per environment agent."""

    def __init__(self, n_agents, obs_dim, action_dim):
        self.n_agents = n_agents
        self.agents = [IQLAgent(obs_dim, action_dim) for _ in range(n_agents)]

    def choose_actions(self, observations):
        """Pick one action per agent from its own local observation."""
        actions = []
        for agent, obs in zip(self.agents, observations):
            actions.append(agent.choose_action(obs))
        return actions

    def update(self, observations, actions, rewards, next_observations, done):
        """Run one Q-learning update per agent; returns the list of losses."""
        return [
            agent.update(
                observations[i], actions[i],
                rewards[i], next_observations[i], done,
            )
            for i, agent in enumerate(self.agents)
        ]
254
255
def train_cooperative():
    """Train independent Q-learners in the cooperative grid world.

    Returns:
        List of per-episode rewards averaged over the two agents.
    """
    print("=== 협력 환경 학습 ===\n")

    env = SimpleGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    episode_rewards = []

    for episode in range(n_episodes):
        obs = env.reset()
        totals = [0.0, 0.0]
        done = False

        # Roll out one episode, learning online from every transition.
        while not done:
            acts = system.choose_actions(obs)
            next_obs, rews, done = env.step(acts)

            system.update(obs, acts, rews, next_obs, done)

            obs = next_obs
            for k in range(2):
                totals[k] += rews[k]

        episode_rewards.append(sum(totals) / 2)

        # Anneal exploration toward a small floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}")

    return episode_rewards
296
297
def train_competitive():
    """Train independent Q-learners in the competitive grid world.

    Returns:
        Tuple (agent0_wins, agent1_wins) of per-episode 0/1 win indicators.
    """
    print("\n=== 경쟁 환경 학습 ===\n")

    env = CompetitiveGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    agent0_wins = []
    agent1_wins = []

    for episode in range(n_episodes):
        obs = env.reset()
        totals = [0.0, 0.0]
        done = False

        # Roll out one episode, learning online from every transition.
        while not done:
            acts = system.choose_actions(obs)
            next_obs, rews, done = env.step(acts)

            system.update(obs, acts, rews, next_obs, done)

            obs = next_obs
            for k in range(2):
                totals[k] += rews[k]

        # Record the winner (higher episode return); draws count for neither.
        agent0_wins.append(int(totals[0] > totals[1]))
        agent1_wins.append(int(totals[1] > totals[0]))

        # Anneal exploration toward a small floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            w0 = np.mean(agent0_wins[-100:]) * 100
            w1 = np.mean(agent1_wins[-100:]) * 100
            print(f"Episode {episode + 1}, Win Rate - Agent0: {w0:.1f}%, Agent1: {w1:.1f}%")

    return agent0_wins, agent1_wins
342
343
def visualize_results(coop_rewards, comp_wins):
    """Plot both experiments and save the figure to multi_agent_results.png.

    Args:
        coop_rewards: per-episode mean rewards from the cooperative run.
        comp_wins: tuple (agent0_wins, agent1_wins) of 0/1 win indicators.
    """
    fig, (ax_coop, ax_comp) = plt.subplots(1, 2, figsize=(14, 5))
    window = 50
    kernel = np.ones(window) / window  # moving-average filter

    # Cooperative run: faint raw curve plus smoothed trend.
    ax_coop.plot(coop_rewards, alpha=0.3, color='blue')
    ax_coop.plot(np.convolve(coop_rewards, kernel, mode='valid'),
                 color='blue', linewidth=2)
    ax_coop.set_title('협력 환경: 평균 보상')
    ax_coop.set_xlabel('Episode')
    ax_coop.set_ylabel('Average Reward')
    ax_coop.grid(True, alpha=0.3)

    # Competitive run: smoothed win rate per agent, with 50% balance line.
    agent0_wins, agent1_wins = comp_wins
    ax_comp.plot(np.convolve(agent0_wins, kernel, mode='valid') * 100,
                 label='Agent 0', linewidth=2)
    ax_comp.plot(np.convolve(agent1_wins, kernel, mode='valid') * 100,
                 label='Agent 1', linewidth=2)
    ax_comp.axhline(y=50, color='r', linestyle='--', alpha=0.3, label='균형점')
    ax_comp.set_title('경쟁 환경: 승률')
    ax_comp.set_xlabel('Episode')
    ax_comp.set_ylabel('Win Rate (%)')
    ax_comp.legend()
    ax_comp.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('multi_agent_results.png', dpi=100, bbox_inches='tight')
    print("\n그래프 저장: multi_agent_results.png")
376
377
def demonstrate_ctde_concept():
    """Print an overview of the CTDE paradigm.

    CTDE (Centralized Training, Decentralized Execution): training may use
    global information, while execution relies on local observations only.
    """
    lines = (
        "\n=== CTDE 패러다임 개념 ===\n",
        "훈련 단계:",
        "  - Critic: 모든 에이전트의 관측 + 행동에 접근 가능",
        "  - 글로벌 상태로 가치 함수 학습",
        "\n실행 단계:",
        "  - Actor: 로컬 관측만 사용",
        "  - 분산 실행으로 통신 불필요",
        "\n장점:",
        "  - 학습 시 협력 패턴 발견 용이",
        "  - 실행 시 확장성 좋음",
        "  - 부분 관측 환경에서도 작동",
    )
    for line in lines:
        print(line)
394
395
def _main():
    """Script entry point: run both experiments, explain CTDE, plot results."""
    print("다중 에이전트 강화학습 예제\n")

    # Cooperative experiment.
    coop_rewards = train_cooperative()

    # Competitive experiment.
    agent0_wins, agent1_wins = train_competitive()

    # Conceptual overview of CTDE.
    demonstrate_ctde_concept()

    # Visualize and save both learning curves.
    visualize_results(coop_rewards, (agent0_wins, agent1_wins))

    print("\n학습 완료!")
    print("\n주요 개념:")
    print("1. IQL: 각 에이전트가 독립적으로 Q-learning")
    print("2. 비정상성: 다른 에이전트의 정책 변화로 환경이 동적")
    print("3. 협력 vs 경쟁: 보상 구조에 따른 학습 양상 차이")
    print("4. CTDE: 중앙집중 학습, 분산 실행 패러다임")


if __name__ == "__main__":
    _main()