"""
Multi-Agent RL: IQL with simple cooperative and competitive environments.

Implements the basic concepts of multi-agent reinforcement learning.
"""
5import torch
6import torch.nn as nn
7import torch.nn.functional as F
8import numpy as np
9import matplotlib.pyplot as plt
10
11
class IQLAgent:
    """Independent Q-Learning (IQL) agent.

    Each agent learns its own Q-function from local observations only,
    implicitly treating all other agents as part of the environment.
    """

    def __init__(self, obs_dim, action_dim, lr=1e-3, gamma=0.99, epsilon=0.1):
        # Small MLP mapping an observation vector to one Q-value per action.
        self.q_network = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
        )
        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_dim = action_dim

    def choose_action(self, obs):
        """Pick an action with an epsilon-greedy policy."""
        explore = np.random.random() < self.epsilon
        if explore:
            return np.random.randint(self.action_dim)
        with torch.no_grad():
            scores = self.q_network(torch.FloatTensor(obs))
        return scores.argmax().item()

    def update(self, obs, action, reward, next_obs, done):
        """Perform one TD(0) Q-learning step and return the scalar loss."""
        state = torch.FloatTensor(obs)
        next_state = torch.FloatTensor(next_obs)

        predicted_q = self.q_network(state)[action]

        with torch.no_grad():
            if done:
                td_target = reward  # terminal transition: no bootstrap term
            else:
                td_target = reward + self.gamma * self.q_network(next_state).max()

        loss = (predicted_q - td_target) ** 2

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
56
57
class SimpleGridWorld:
    """Cooperative two-agent grid environment.

    - Grid: grid_size x grid_size (5x5 by default).
    - Each agent has its own fixed goal cell in an opposite corner.
    - Cooperation incentive: both agents get a bonus whenever they
      occupy the same cell.
    """

    def __init__(self, grid_size=5):
        self.grid_size = grid_size
        self.n_agents = 2

        # Actions: up, down, left, right, stay.
        self.action_dim = 5
        self.obs_dim = 4  # (x, y, goal_x, goal_y)

        self.reset()

    def reset(self):
        """Reset agents to random cells and return initial observations."""
        self.agent_pos = []
        for _ in range(self.n_agents):
            x = np.random.randint(self.grid_size)
            y = np.random.randint(self.grid_size)
            self.agent_pos.append([x, y])

        # Fixed per-agent goals in opposite corners.
        self.goals = [
            [0, self.grid_size - 1],       # agent 0's goal
            [self.grid_size - 1, 0],       # agent 1's goal
        ]

        self.steps = 0
        return self.get_observations()

    def get_observations(self):
        """Return each agent's normalized (x, y, goal_x, goal_y) vector."""
        scale = self.grid_size
        return [
            np.array(
                [
                    self.agent_pos[i][0] / scale,
                    self.agent_pos[i][1] / scale,
                    self.goals[i][0] / scale,
                    self.goals[i][1] / scale,
                ],
                dtype=np.float32,
            )
            for i in range(self.n_agents)
        ]

    def step(self, actions):
        """Advance one environment step; returns (observations, rewards, done)."""
        self.steps += 1
        rewards = [0.0, 0.0]
        hi = self.grid_size - 1

        for i, action in enumerate(actions):
            x, y = self.agent_pos[i]

            # Apply move: up(0), down(1), left(2), right(3), stay(4),
            # clamped to the grid bounds.
            if action == 0:
                x = max(0, x - 1)
            elif action == 1:
                x = min(hi, x + 1)
            elif action == 2:
                y = max(0, y - 1)
            elif action == 3:
                y = min(hi, y + 1)

            self.agent_pos[i] = [x, y]

            # Goal reward (granted every step spent on the goal cell).
            if self.agent_pos[i] == self.goals[i]:
                rewards[i] += 10.0

            # Small per-step penalty to encourage short paths.
            rewards[i] -= 0.01

        # Cooperation bonus when both agents share a cell.
        if self.agent_pos[0] == self.agent_pos[1]:
            rewards[0] += 1.0
            rewards[1] += 1.0

        # Episode ends when both agents sit on their goals, or after 50 steps.
        done = (
            (self.agent_pos[0] == self.goals[0] and self.agent_pos[1] == self.goals[1])
            or self.steps >= 50
        )

        return self.get_observations(), rewards, done
148
149
class CompetitiveGridWorld:
    """Competitive environment: two agents race for a single reward.

    Only the first agent to reach the shared goal cell collects it.
    """

    def __init__(self, grid_size=5):
        self.grid_size = grid_size
        self.n_agents = 2
        self.action_dim = 5
        self.obs_dim = 4

        self.reset()

    def reset(self):
        """Place agents in opposite corners and the goal in the center."""
        self.agent_pos = [
            [0, 0],
            [self.grid_size - 1, self.grid_size - 1],
        ]

        # Single shared goal at the center of the grid.
        self.goal = [self.grid_size // 2, self.grid_size // 2]
        self.goal_taken = False
        self.steps = 0

        return self.get_observations()

    def get_observations(self):
        """Return each agent's normalized (x, y, goal_x, goal_y) vector."""
        scale = self.grid_size
        obs_list = []
        for pos in self.agent_pos:
            vec = [pos[0] / scale, pos[1] / scale,
                   self.goal[0] / scale, self.goal[1] / scale]
            obs_list.append(np.array(vec, dtype=np.float32))
        return obs_list

    def step(self, actions):
        """Advance one environment step; returns (observations, rewards, done)."""
        self.steps += 1
        rewards = [0.0, 0.0]
        hi = self.grid_size - 1

        for i, action in enumerate(actions):
            x, y = self.agent_pos[i]

            # up(0), down(1), left(2), right(3); any other action is "stay".
            if action == 0:
                x = max(0, x - 1)
            elif action == 1:
                x = min(hi, x + 1)
            elif action == 2:
                y = max(0, y - 1)
            elif action == 3:
                y = min(hi, y + 1)

            self.agent_pos[i] = [x, y]

            # First arrival claims the single goal reward; the lower agent
            # index wins ties because agents are processed in order.
            if not self.goal_taken and self.agent_pos[i] == self.goal:
                rewards[i] += 10.0
                self.goal_taken = True

            # Small per-step penalty.
            rewards[i] -= 0.01

        # Episode ends once the goal is claimed, or after 50 steps.
        done = self.goal_taken or self.steps >= 50

        return self.get_observations(), rewards, done
225
226
class IQLSystem:
    """Container for a set of independent Q-learners, one per agent."""

    def __init__(self, n_agents, obs_dim, action_dim):
        self.agents = [IQLAgent(obs_dim, action_dim) for _ in range(n_agents)]
        self.n_agents = n_agents

    def choose_actions(self, observations):
        """Pick one action per agent from its own observation."""
        actions = []
        for agent, obs in zip(self.agents, observations):
            actions.append(agent.choose_action(obs))
        return actions

    def update(self, observations, actions, rewards, next_observations, done):
        """Run an independent Q-update for every agent; returns the losses."""
        return [
            agent.update(
                observations[i], actions[i],
                rewards[i], next_observations[i], done,
            )
            for i, agent in enumerate(self.agents)
        ]
254
255
def train_cooperative():
    """Train independent Q-learners in the cooperative grid world.

    Returns the per-episode mean reward (averaged over both agents).
    """
    print("=== νλ ₯ νκ²½ νμ΅ ===\n")

    env = SimpleGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    episode_rewards = []

    for episode in range(n_episodes):
        obs = env.reset()
        totals = [0.0, 0.0]
        done = False

        while not done:
            acts = system.choose_actions(obs)
            next_obs, rewards, done = env.step(acts)

            system.update(obs, acts, rewards, next_obs, done)

            obs = next_obs
            totals[0] += rewards[0]
            totals[1] += rewards[1]

        episode_rewards.append(sum(totals) / 2)

        # Anneal exploration toward a 1% floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}")

    return episode_rewards
296
297
def train_competitive():
    """Train independent Q-learners in the competitive grid world.

    Returns two 0/1 lists recording, per episode, whether each agent
    out-scored the other.
    """
    print("\n=== κ²½μ νκ²½ νμ΅ ===\n")

    env = CompetitiveGridWorld(grid_size=5)
    system = IQLSystem(
        n_agents=env.n_agents,
        obs_dim=env.obs_dim,
        action_dim=env.action_dim
    )

    n_episodes = 1000
    agent0_wins = []
    agent1_wins = []

    for episode in range(n_episodes):
        obs = env.reset()
        scores = [0.0, 0.0]
        done = False

        while not done:
            acts = system.choose_actions(obs)
            next_obs, rewards, done = env.step(acts)

            system.update(obs, acts, rewards, next_obs, done)

            obs = next_obs
            scores[0] += rewards[0]
            scores[1] += rewards[1]

        # Record which agent won the episode (ties count for neither).
        agent0_wins.append(1 if scores[0] > scores[1] else 0)
        agent1_wins.append(1 if scores[1] > scores[0] else 0)

        # Anneal exploration toward a 1% floor.
        for agent in system.agents:
            agent.epsilon = max(0.01, agent.epsilon * 0.995)

        if (episode + 1) % 100 == 0:
            win_rate_0 = np.mean(agent0_wins[-100:]) * 100
            win_rate_1 = np.mean(agent1_wins[-100:]) * 100
            print(f"Episode {episode + 1}, Win Rate - Agent0: {win_rate_0:.1f}%, Agent1: {win_rate_1:.1f}%")

    return agent0_wins, agent1_wins
342
343
def visualize_results(coop_rewards, comp_wins):
    """Plot cooperative rewards and competitive win rates side by side.

    Saves the figure to 'multi_agent_results.png'.
    """
    fig, (ax_coop, ax_comp) = plt.subplots(1, 2, figsize=(14, 5))

    window = 50
    kernel = np.ones(window) / window  # moving-average smoothing kernel

    # Cooperative environment: raw curve plus smoothed average reward.
    ax_coop.plot(coop_rewards, alpha=0.3, color='blue')
    ax_coop.plot(np.convolve(coop_rewards, kernel, mode='valid'),
                 color='blue', linewidth=2)
    ax_coop.set_title('νλ ₯ νκ²½: νκ· λ³΄μ')
    ax_coop.set_xlabel('Episode')
    ax_coop.set_ylabel('Average Reward')
    ax_coop.grid(True, alpha=0.3)

    # Competitive environment: rolling win rate per agent.
    agent0_wins, agent1_wins = comp_wins
    rate0 = np.convolve(agent0_wins, kernel, mode='valid') * 100
    rate1 = np.convolve(agent1_wins, kernel, mode='valid') * 100

    ax_comp.plot(rate0, label='Agent 0', linewidth=2)
    ax_comp.plot(rate1, label='Agent 1', linewidth=2)
    ax_comp.axhline(y=50, color='r', linestyle='--', alpha=0.3, label='κ· νμ ')
    ax_comp.set_title('κ²½μ νκ²½: μΉλ₯ ')
    ax_comp.set_xlabel('Episode')
    ax_comp.set_ylabel('Win Rate (%)')
    ax_comp.legend()
    ax_comp.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('multi_agent_results.png', dpi=100, bbox_inches='tight')
    print("\nκ·Έλν μ μ₯: multi_agent_results.png")
376
377
def demonstrate_ctde_concept():
    """
    Explain the CTDE (Centralized Training, Decentralized Execution) paradigm:
    training may use global information, while execution relies on local
    observations only. Prints a short (Korean-language) summary to stdout.
    """
    # NOTE(review): the printed text below was mojibake in the original file
    # and some multibyte characters were split across lines; the fragments
    # have been rejoined verbatim — confirm against the original encoding.
    print("\n=== CTDE ν¨λ¬λ€μ κ°λ===\n")
    print("νλ ¨ λ¨κ³:")
    print(" - Critic: λͺ¨λ μμ΄μ νΈμ κ΄μΈ‘ + νλμ μ κ·Ό κ°λ₯")
    print(" - κΈλ‘λ² μνλ‘ κ°μΉ ν¨μ νμ΅")
    print("\nμ€ν λ¨κ³:")
    print(" - Actor: λ‘컬 κ΄μΈ‘λ§ μ¬μ©")
    print(" - λΆμ° μ€νμΌλ‘ ν΅μ λΆνμ")
    print("\nμ₯μ :")
    print(" - νμ΅ μ νλ ₯ ν¨ν΄ λ°κ²¬ μ©μ΄")
    print(" - μ€ν μ νμ₯μ± μ’μ")
    print(" - λΆλΆ κ΄μΈ‘ νκ²½μμλ μλ")
394
395
396if __name__ == "__main__":
397 print("λ€μ€ μμ΄μ νΈ κ°ννμ΅ μμ \n")
398
399 # νλ ₯ νκ²½ νμ΅
400 coop_rewards = train_cooperative()
401
402 # κ²½μ νκ²½ νμ΅
403 agent0_wins, agent1_wins = train_competitive()
404
405 # CTDE κ°λ
μ€λͺ
406 demonstrate_ctde_concept()
407
408 # κ²°κ³Ό μκ°ν
409 visualize_results(coop_rewards, (agent0_wins, agent1_wins))
410
411 print("\nνμ΅ μλ£!")
412 print("\nμ£Όμ κ°λ
:")
413 print("1. IQL: κ° μμ΄μ νΈκ° λ
립μ μΌλ‘ Q-learning")
414 print("2. λΉμ μμ±: λ€λ₯Έ μμ΄μ νΈμ μ μ±
λ³νλ‘ νκ²½μ΄ λμ ")
415 print("3. νλ ₯ vs κ²½μ: 보μ ꡬ쑰μ λ°λ₯Έ νμ΅ μμ μ°¨μ΄")
416 print("4. CTDE: μ€μμ§μ€ νμ΅, λΆμ° μ€ν ν¨λ¬λ€μ")