@@ -28,6 +28,20 @@ def __init__(self, cfg: dict) -> None:
         self._observation_space = gym.spaces.MultiBinary(4 * cfg.agent_view_sight)
         self._reward_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(1, ), dtype=np.float32)
 
+    def calculate_action_prob(self, actions):
+        num_action = self._action_space.n
+        N = actions.shape[0]
+        # Convert actions to one-hot encodings
+        one_hot_actions = np.eye(num_action)[actions.flatten()]
+        action_prob = np.zeros((N, num_action))
+
+        for i in range(N):
+            # Exclude agent i's own action and average the one-hot actions of all other agents
+            exclude_current = np.delete(one_hot_actions, i, axis=0)
+            action_prob[i] = exclude_current.mean(axis=0)
+
+        return action_prob
+
     def reset(self) -> np.ndarray:
         if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed:
             np_seed = 100 * np.random.randint(1, 1000)
@@ -47,6 +61,9 @@ def reset(self) -> np.ndarray:
             self._init_flag = True
         obs = self._env._reset()
         obs = np.stack(obs)
+        self.pre_action = np.zeros(self._cfg.num_agents, dtype=np.int32)
+        pre_action_prob = np.zeros((self._cfg.num_agents, self._action_space.n))
+        obs = np.concatenate([obs, pre_action_prob], axis=1)
         obs = to_ndarray(obs).astype(np.float32)
         self._eval_episode_return = np.zeros((self._cfg.num_agents, 1), dtype=np.float32)
         return obs
@@ -63,13 +80,20 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None:
 
     def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep:
         action = to_ndarray(action)
+        if len(action.shape) == 1:
+            action = np.expand_dims(action, axis=1)
         obs, rew, done, order_param, ups, downs = self._env._step(action)
-        info = {"order_param": order_param, "ups": ups, "downs": downs}
+        info = {"order_param": order_param, "ups": ups, "downs": downs, 'pre_action': self.pre_action}
+        pre_action_prob = self.calculate_action_prob(self.pre_action)
+        self.pre_action = action
         obs = np.stack(obs)
+        obs = np.concatenate([obs, pre_action_prob], axis=1)
         obs = to_ndarray(obs).astype(np.float32)
         rew = np.stack(rew)
         rew = to_ndarray(rew).astype(np.float32)
         self._eval_episode_return += rew
+
+        done = done[0]  # dones are the same for all agents
         if done:
             info['eval_episode_return'] = self._eval_episode_return
         return BaseEnvTimestep(obs, rew, done, info)
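For reference, the snippet below is a minimal standalone sketch of what the new `calculate_action_prob` helper computes: for each agent it returns the empirical distribution of the other agents' previous actions, which is then concatenated onto that agent's observation. The agent count, action count, and action values here are illustrative, not taken from the environment config.

```python
import numpy as np

# Hypothetical setup: 3 agents, 2 discrete actions (not from cfg).
num_action = 2
pre_action = np.array([0, 1, 1], dtype=np.int32)  # previous action of each agent

# Same steps as calculate_action_prob: one-hot encode, then average over the other agents.
one_hot = np.eye(num_action)[pre_action.flatten()]        # shape (3, 2)
action_prob = np.zeros((pre_action.shape[0], num_action))
for i in range(pre_action.shape[0]):
    others = np.delete(one_hot, i, axis=0)                 # drop agent i's own action
    action_prob[i] = others.mean(axis=0)                   # action distribution of the remaining agents

print(action_prob)
# [[0.  1. ]    agent 0: both other agents chose action 1
#  [0.5 0.5]    agent 1: the others chose actions 0 and 1
#  [0.5 0.5]]   agent 2: the others chose actions 0 and 1
```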