# 11_multi_agent.py
  1"""
  2Multi-Agent RL: IQLκ³Ό κ°„λ‹¨ν•œ ν˜‘λ ₯/경쟁 ν™˜κ²½
  3닀쀑 μ—μ΄μ „νŠΈ κ°•ν™”ν•™μŠ΅μ˜ κΈ°λ³Έ κ°œλ… κ΅¬ν˜„
  4"""
  5import torch
  6import torch.nn as nn
  7import torch.nn.functional as F
  8import numpy as np
  9import matplotlib.pyplot as plt
 10
 11
 12class IQLAgent:
 13    """Independent Q-Learning μ—μ΄μ „νŠΈ"""
 14
 15    def __init__(self, obs_dim, action_dim, lr=1e-3, gamma=0.99, epsilon=0.1):
 16        self.q_network = nn.Sequential(
 17            nn.Linear(obs_dim, 64),
 18            nn.ReLU(),
 19            nn.Linear(64, 64),
 20            nn.ReLU(),
 21            nn.Linear(64, action_dim)
 22        )
 23        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=lr)
 24        self.gamma = gamma
 25        self.epsilon = epsilon
 26        self.action_dim = action_dim
 27
 28    def choose_action(self, obs):
 29        """Epsilon-greedy 행동 선택"""
 30        if np.random.random() < self.epsilon:
 31            return np.random.randint(self.action_dim)
 32        with torch.no_grad():
 33            q_values = self.q_network(torch.FloatTensor(obs))
 34            return q_values.argmax().item()
 35
 36    def update(self, obs, action, reward, next_obs, done):
 37        """Q-learning μ—…λ°μ΄νŠΈ"""
 38        obs_tensor = torch.FloatTensor(obs)
 39        next_obs_tensor = torch.FloatTensor(next_obs)
 40
 41        current_q = self.q_network(obs_tensor)[action]
 42
 43        with torch.no_grad():
 44            if done:
 45                target_q = reward
 46            else:
 47                target_q = reward + self.gamma * self.q_network(next_obs_tensor).max()
 48
 49        loss = (current_q - target_q) ** 2
 50
 51        self.optimizer.zero_grad()
 52        loss.backward()
 53        self.optimizer.step()
 54
 55        return loss.item()
 56
 57
 58class SimpleGridWorld:
 59    """
 60    κ°„λ‹¨ν•œ 2-μ—μ΄μ „νŠΈ κ·Έλ¦¬λ“œ ν™˜κ²½
 61    - κ·Έλ¦¬λ“œ: 5x5
 62    - λͺ©ν‘œ: 두 μ—μ΄μ „νŠΈκ°€ 각자의 λͺ©ν‘œ 지점에 도달
 63    - ν˜‘λ ₯ μš”μ†Œ: 같은 셀에 있으면 λ³΄λ„ˆμŠ€ 보상
 64    """
 65
 66    def __init__(self, grid_size=5):
 67        self.grid_size = grid_size
 68        self.n_agents = 2
 69
 70        # 행동: 상, ν•˜, 쒌, 우, λŒ€κΈ°
 71        self.action_dim = 5
 72        self.obs_dim = 4  # (x, y, goal_x, goal_y)
 73
 74        self.reset()
 75
 76    def reset(self):
 77        """ν™˜κ²½ μ΄ˆκΈ°ν™”"""
 78        # μ—μ΄μ „νŠΈ 초기 μœ„μΉ˜ (λ¬΄μž‘μœ„)
 79        self.agent_pos = [
 80            [np.random.randint(self.grid_size), np.random.randint(self.grid_size)]
 81            for _ in range(self.n_agents)
 82        ]
 83
 84        # λͺ©ν‘œ μœ„μΉ˜ (κ³ μ •)
 85        self.goals = [
 86            [0, self.grid_size - 1],  # μ—μ΄μ „νŠΈ 0의 λͺ©ν‘œ
 87            [self.grid_size - 1, 0]   # μ—μ΄μ „νŠΈ 1의 λͺ©ν‘œ
 88        ]
 89
 90        self.steps = 0
 91        return self.get_observations()
 92
 93    def get_observations(self):
 94        """각 μ—μ΄μ „νŠΈμ˜ κ΄€μΈ‘ λ°˜ν™˜"""
 95        observations = []
 96        for i in range(self.n_agents):
 97            obs = [
 98                self.agent_pos[i][0] / self.grid_size,
 99                self.agent_pos[i][1] / self.grid_size,
100                self.goals[i][0] / self.grid_size,
101                self.goals[i][1] / self.grid_size
102            ]
103            observations.append(np.array(obs, dtype=np.float32))
104        return observations
105
106    def step(self, actions):
107        """ν™˜κ²½ μŠ€ν…"""
108        self.steps += 1
109        rewards = [0.0, 0.0]
110
111        # 각 μ—μ΄μ „νŠΈ 이동
112        for i, action in enumerate(actions):
113            x, y = self.agent_pos[i]
114
115            # 행동 적용: 상(0), ν•˜(1), 쒌(2), 우(3), λŒ€κΈ°(4)
116            if action == 0:  # 상
117                x = max(0, x - 1)
118            elif action == 1:  # ν•˜
119                x = min(self.grid_size - 1, x + 1)
120            elif action == 2:  # 쒌
121                y = max(0, y - 1)
122            elif action == 3:  # 우
123                y = min(self.grid_size - 1, y + 1)
124            # action == 4: λŒ€κΈ°
125
126            self.agent_pos[i] = [x, y]
127
128            # λͺ©ν‘œ 도달 보상
129            if self.agent_pos[i] == self.goals[i]:
130                rewards[i] += 10.0
131
132            # λ§€ μŠ€ν…λ§ˆλ‹€ μž‘μ€ νŽ˜λ„ν‹°
133            rewards[i] -= 0.01
134
135        # ν˜‘λ ₯ λ³΄λ„ˆμŠ€: 같은 셀에 있으면
136        if self.agent_pos[0] == self.agent_pos[1]:
137            rewards[0] += 1.0
138            rewards[1] += 1.0
139
140        # μ’…λ£Œ 쑰건: λ‘˜ λ‹€ λͺ©ν‘œ 도달 λ˜λŠ” μ΅œλŒ€ μŠ€ν… 도달
141        done = (
142            (self.agent_pos[0] == self.goals[0] and self.agent_pos[1] == self.goals[1])
143            or self.steps >= 50
144        )
145
146        observations = self.get_observations()
147        return observations, rewards, done
148
149
class CompetitiveGridWorld:
    """Competitive environment: two agents race for a single reward.

    The first agent to reach the shared goal cell claims the reward;
    ties within one step are resolved in agent-index order.
    """

    def __init__(self, grid_size=5):
        self.grid_size = grid_size
        self.n_agents = 2
        self.action_dim = 5
        self.obs_dim = 4

        self.reset()

    def reset(self):
        """Start a new episode with agents in opposite corners."""
        last = self.grid_size - 1
        self.agent_pos = [
            [0, 0],
            [last, last],
        ]

        # Single shared goal at the grid center; unclaimed at episode start.
        self.goal = [self.grid_size // 2, self.grid_size // 2]
        self.goal_taken = False
        self.steps = 0

        return self.get_observations()

    def get_observations(self):
        """Return each agent's normalized local observation."""
        scale = self.grid_size
        gx, gy = self.goal
        return [
            np.array([px / scale, py / scale, gx / scale, gy / scale],
                     dtype=np.float32)
            for px, py in self.agent_pos
        ]

    def step(self, actions):
        """Advance one timestep; returns (observations, rewards, done)."""
        self.steps += 1
        rewards = [0.0] * self.n_agents

        # (dx, dy) per action: up(0), down(1), left(2), right(3), stay(4).
        deltas = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1), 4: (0, 0)}
        hi = self.grid_size - 1

        for i, action in enumerate(actions):
            dx, dy = deltas[action]
            nx = min(max(self.agent_pos[i][0] + dx, 0), hi)
            ny = min(max(self.agent_pos[i][1] + dy, 0), hi)
            self.agent_pos[i] = [nx, ny]

            # Only the first agent to arrive claims the reward.
            if not self.goal_taken and self.agent_pos[i] == self.goal:
                rewards[i] += 10.0
                self.goal_taken = True

            # Small per-step penalty to encourage short paths.
            rewards[i] -= 0.01

        # Episode ends once the goal is claimed or after 50 steps.
        done = self.goal_taken or self.steps >= 50

        return self.get_observations(), rewards, done
225
226
class IQLSystem:
    """A team of independent Q-learners, one IQLAgent per environment agent."""

    def __init__(self, n_agents, obs_dim, action_dim):
        self.n_agents = n_agents
        self.agents = [IQLAgent(obs_dim, action_dim) for _ in range(n_agents)]

    def choose_actions(self, observations):
        """Pick one action per agent from its own local observation."""
        actions = []
        for agent, obs in zip(self.agents, observations):
            actions.append(agent.choose_action(obs))
        return actions

    def update(self, observations, actions, rewards, next_observations, done):
        """Run one Q-learning update per agent; returns the list of losses."""
        return [
            agent.update(
                observations[i], actions[i],
                rewards[i], next_observations[i], done,
            )
            for i, agent in enumerate(self.agents)
        ]
254
255
def train_cooperative():
    """Train independent Q-learners in the cooperative grid world.

    Returns:
        List of per-episode rewards averaged over the two agents.
    """
    print("=== 협력 환경 학습 ===\n")

    env = SimpleGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    episode_rewards = []

    for episode in range(n_episodes):
        obs = env.reset()
        totals = [0.0, 0.0]
        done = False

        # Roll out one episode, learning online from every transition.
        while not done:
            acts = system.choose_actions(obs)
            next_obs, rews, done = env.step(acts)

            system.update(obs, acts, rews, next_obs, done)

            obs = next_obs
            for k in range(2):
                totals[k] += rews[k]

        episode_rewards.append(sum(totals) / 2)

        # Anneal exploration toward a small floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}")

    return episode_rewards
296
297
def train_competitive():
    """Train independent Q-learners in the competitive grid world.

    Returns:
        Tuple (agent0_wins, agent1_wins) of per-episode 0/1 win indicators.
    """
    print("\n=== 경쟁 환경 학습 ===\n")

    env = CompetitiveGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    agent0_wins = []
    agent1_wins = []

    for episode in range(n_episodes):
        obs = env.reset()
        totals = [0.0, 0.0]
        done = False

        # Roll out one episode, learning online from every transition.
        while not done:
            acts = system.choose_actions(obs)
            next_obs, rews, done = env.step(acts)

            system.update(obs, acts, rews, next_obs, done)

            obs = next_obs
            for k in range(2):
                totals[k] += rews[k]

        # Record the winner (higher episode return); draws count for neither.
        agent0_wins.append(int(totals[0] > totals[1]))
        agent1_wins.append(int(totals[1] > totals[0]))

        # Anneal exploration toward a small floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            w0 = np.mean(agent0_wins[-100:]) * 100
            w1 = np.mean(agent1_wins[-100:]) * 100
            print(f"Episode {episode + 1}, Win Rate - Agent0: {w0:.1f}%, Agent1: {w1:.1f}%")

    return agent0_wins, agent1_wins
342
343
def visualize_results(coop_rewards, comp_wins):
    """Plot both experiments and save the figure to multi_agent_results.png.

    Args:
        coop_rewards: per-episode mean rewards from the cooperative run.
        comp_wins: tuple (agent0_wins, agent1_wins) of 0/1 win indicators.
    """
    fig, (ax_coop, ax_comp) = plt.subplots(1, 2, figsize=(14, 5))
    window = 50
    kernel = np.ones(window) / window  # moving-average filter

    # Cooperative run: faint raw curve plus smoothed trend.
    ax_coop.plot(coop_rewards, alpha=0.3, color='blue')
    ax_coop.plot(np.convolve(coop_rewards, kernel, mode='valid'),
                 color='blue', linewidth=2)
    ax_coop.set_title('협력 환경: 평균 보상')
    ax_coop.set_xlabel('Episode')
    ax_coop.set_ylabel('Average Reward')
    ax_coop.grid(True, alpha=0.3)

    # Competitive run: smoothed win rate per agent, with 50% balance line.
    agent0_wins, agent1_wins = comp_wins
    ax_comp.plot(np.convolve(agent0_wins, kernel, mode='valid') * 100,
                 label='Agent 0', linewidth=2)
    ax_comp.plot(np.convolve(agent1_wins, kernel, mode='valid') * 100,
                 label='Agent 1', linewidth=2)
    ax_comp.axhline(y=50, color='r', linestyle='--', alpha=0.3, label='균형점')
    ax_comp.set_title('경쟁 환경: 승률')
    ax_comp.set_xlabel('Episode')
    ax_comp.set_ylabel('Win Rate (%)')
    ax_comp.legend()
    ax_comp.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('multi_agent_results.png', dpi=100, bbox_inches='tight')
    print("\n그래프 저장: multi_agent_results.png")
376
377
def demonstrate_ctde_concept():
    """Print an overview of the CTDE paradigm.

    CTDE (Centralized Training, Decentralized Execution): training may use
    global information, while execution relies on local observations only.
    """
    lines = (
        "\n=== CTDE 패러다임 개념 ===\n",
        "훈련 단계:",
        "  - Critic: 모든 에이전트의 관측 + 행동에 접근 가능",
        "  - 글로벌 상태로 가치 함수 학습",
        "\n실행 단계:",
        "  - Actor: 로컬 관측만 사용",
        "  - 분산 실행으로 통신 불필요",
        "\n장점:",
        "  - 학습 시 협력 패턴 발견 용이",
        "  - 실행 시 확장성 좋음",
        "  - 부분 관측 환경에서도 작동",
    )
    for line in lines:
        print(line)
394
395
def _main():
    """Script entry point: run both experiments, explain CTDE, plot results."""
    print("다중 에이전트 강화학습 예제\n")

    # Cooperative experiment.
    coop_rewards = train_cooperative()

    # Competitive experiment.
    agent0_wins, agent1_wins = train_competitive()

    # Conceptual overview of CTDE.
    demonstrate_ctde_concept()

    # Visualize and save both learning curves.
    visualize_results(coop_rewards, (agent0_wins, agent1_wins))

    print("\n학습 완료!")
    print("\n주요 개념:")
    print("1. IQL: 각 에이전트가 독립적으로 Q-learning")
    print("2. 비정상성: 다른 에이전트의 정책 변화로 환경이 동적")
    print("3. 협력 vs 경쟁: 보상 구조에 따른 학습 양상 차이")
    print("4. CTDE: 중앙집중 학습, 분산 실행 패러다임")


if __name__ == "__main__":
    _main()