
Commit 3d18513

Author: nighood
Commit message: fix(rjy): try to fix reward problem
Parent: ad8c53c

File tree: 5 files changed, +29 -15 lines


ding/model/common/head.py (+1)

@@ -61,6 +61,7 @@ def __init__(
                 norm_type=norm_type
             ), block(hidden_size, output_size)
         )
+        nn.init.normal_(self.Q[1].weight, 0, 0.2)
 
     def forward(self, x: torch.Tensor) -> Dict:
         """

ding/policy/common_utils.py (+3, -1)

@@ -62,7 +62,9 @@ def default_preprocess_learn(
         if len(reward.shape) == 1:
             reward = reward.unsqueeze(1)
         # reward: (batch_size, nstep) -> (nstep, batch_size)
-        data['reward'] = reward.permute(1, 0).contiguous()
+        # reversed_shape = [i for i in range(len(reward.shape))][::-1]
+        # data['reward'] = reward.permute(reversed_shape).contiguous()
+        data['reward'] = reward.transpose(0, -1).contiguous()
     else:
         if data['reward'].dim() == 2 and data['reward'].shape[1] == 1:
             data['reward'] = data['reward'].squeeze(-1)
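
The switch from permute(1, 0) to transpose(0, -1) still moves the n-step axis to the front for a 2-D reward, but also tolerates an extra agent dimension, which permute(1, 0) would reject. A shape-only sketch with assumed sizes (batch 32, 100 agents, 5-step return):

import torch

single = torch.zeros(32, 5)      # (batch_size, nstep)
multi = torch.zeros(32, 100, 5)  # (batch_size, agent_num, nstep)

print(single.permute(1, 0).shape)     # torch.Size([5, 32]) -- old code, 2-D only
print(single.transpose(0, -1).shape)  # torch.Size([5, 32]) -- same result in 2-D
print(multi.transpose(0, -1).shape)   # torch.Size([5, 100, 32]) -- nstep moved to the front
# multi.permute(1, 0) would raise an error because permute must name all 3 dims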

ding/rl_utils/adder.py (+9, -4)

@@ -121,7 +121,7 @@ def get_nstep_return_data(
         """
         if nstep == 1:
             return data
-        fake_reward = torch.zeros(1)
+        fake_reward = torch.zeros_like(data[0]['reward'])
         next_obs_flag = 'next_obs' in data[0]
         for i in range(len(data) - nstep):
             # update keys ['next_obs', 'reward', 'done'] with their n-step value
@@ -130,7 +130,7 @@ def get_nstep_return_data(
             if cum_reward:
                 data[i]['reward'] = sum([data[i + j]['reward'] * (gamma ** j) for j in range(nstep)])
             else:
-                data[i]['reward'] = torch.cat([data[i + j]['reward'] for j in range(nstep)])
+                data[i]['reward'] = torch.stack([data[i + j]['reward'] for j in range(nstep)], dim = -1)
             data[i]['done'] = data[i + nstep - 1]['done']
             if correct_terminate_gamma:
                 data[i]['value_gamma'] = gamma ** nstep
@@ -140,10 +140,15 @@ def get_nstep_return_data(
             if cum_reward:
                 data[i]['reward'] = sum([data[i + j]['reward'] * (gamma ** j) for j in range(len(data) - i)])
             else:
-                data[i]['reward'] = torch.cat(
+                data[i]['reward'] = torch.stack(
                     [data[i + j]['reward']
-                     for j in range(len(data) - i)] + [fake_reward for _ in range(nstep - (len(data) - i))]
+                     for j in range(len(data) - i)] + [fake_reward for _ in range(nstep - (len(data) - i))],
+                    dim = -1
                 )
+                try:
+                    assert len(data[i]['reward']) == 300
+                except:
+                    print(len(data[i]['reward']))
             data[i]['done'] = data[-1]['done']
             if correct_terminate_gamma:
                 data[i]['value_gamma'] = gamma ** (len(data) - i - 1)
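
Replacing torch.cat with torch.stack(..., dim=-1) keeps per-agent rewards on their own axis instead of fusing agents and steps into one vector, and torch.zeros_like makes the padding reward match the per-step reward's shape. A sketch with assumed sizes (100 agents, 3-step return):

import torch

nstep = 3
# assumed per-step reward carrying one entry per agent (100 agents)
per_step = [torch.full((100,), float(j)) for j in range(nstep)]

print(torch.cat(per_step).shape)            # torch.Size([300]) -- agents and steps fused
print(torch.stack(per_step, dim=-1).shape)  # torch.Size([100, 3]) -- (agent_num, nstep)

# zeros_like gives the padding reward the same per-agent shape as a real reward
fake_reward = torch.zeros_like(per_step[0])
print(fake_reward.shape)                    # torch.Size([100])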

dizoo/ising_env/config/ising_mfq_config.py (+1, -2)

@@ -12,7 +12,6 @@
         collector_env_num=8,
         evaluator_env_num=8,
         n_evaluator_episode=8,
-        stop_value=20,
         num_agents=num_agents,
         dim_spin=dim_spin,
         agent_view_sight=agent_view_sight,
@@ -62,4 +61,4 @@
 if __name__ == '__main__':
     # or you can enter `ding -m serial -c ising_mfq_config.py -s 0`
     from ding.entry import serial_pipeline
-    serial_pipeline((main_config, create_config), seed=0)
+    serial_pipeline((main_config, create_config), seed=0, max_env_step=1e5)

dizoo/ising_env/envs/ising_model_env.py (+15, -8)

@@ -30,15 +30,20 @@ def __init__(self, cfg: dict) -> None:
 
     def calculate_action_prob(self, actions):
         num_action = self._action_space.n
-        N = actions.shape[0]
+        N = actions.shape[0]  # agent_num
         # Convert actions to one_hot encoding
         one_hot_actions = np.eye(num_action)[actions.flatten()]
         action_prob = np.zeros((N, num_action))
 
         for i in range(N):
-            # Exclude agent i's actions and calculate the one_hot average of all other agent actions
-            exclude_current = np.delete(one_hot_actions, i, axis=0)
-            action_prob[i] = exclude_current.mean(axis=0)
+            # Select only the one_hot actions of agents visible to agent i
+            visible_actions = one_hot_actions[self._env.agents[i].spin_mask == 1]
+            if visible_actions.size > 0:
+                # Calculate the average of the one_hot encoding for visible agents only
+                action_prob[i] = visible_actions.mean(axis=0)
+            else:
+                # If no visible agents, action_prob remains zero for agent i
+                action_prob[i] = np.zeros(num_action)
 
         return action_prob
 
@@ -62,10 +67,11 @@ def reset(self) -> np.ndarray:
         obs = self._env._reset()
         obs = np.stack(obs)
         self.pre_action = np.zeros(self._cfg.num_agents, dtype=np.int32)
-        pre_action_prob = np.zeros((self._cfg.num_agents, self._action_space.n))
+        # consider the last global state as pre action prob
+        pre_action_prob = self.calculate_action_prob(self._env.world.global_state.flatten().astype(int))
         obs = np.concatenate([obs, pre_action_prob], axis=1)
         obs = to_ndarray(obs).astype(np.float32)
-        self._eval_episode_return = np.zeros((self._cfg.num_agents, 1), dtype=np.float32)
+        self._eval_episode_return = 0
         return obs
 
     def close(self) -> None:
@@ -90,8 +96,9 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep:
         obs = np.concatenate([obs, pre_action_prob], axis=1)
         obs = to_ndarray(obs).astype(np.float32)
         rew = np.stack(rew)
-        rew = to_ndarray(rew).astype(np.float32)
-        self._eval_episode_return += rew
+        rew = np.squeeze(to_ndarray(rew).astype(np.float32), axis=1)
+        # rew = to_ndarray(rew).astype(np.float32)
+        self._eval_episode_return += np.sum(rew)
 
         done = done[0]  # dones are the same for all agents
         if done:
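
The rewritten calculate_action_prob averages one-hot actions over only the agents visible to agent i (its spin_mask) instead of over all other agents, falling back to zeros when nothing is visible. A standalone numpy sketch of that computation; the 4-agent visibility matrix below is made up for illustration:

import numpy as np

num_action = 2
actions = np.array([0, 1, 1, 0])   # one action per agent
visibility = np.array([            # stand-in for agents[i].spin_mask, one row per agent
    [0, 1, 1, 0],
    [1, 0, 0, 1],
    [1, 1, 0, 1],
    [0, 0, 0, 0],                  # agent 3 sees nobody
])

one_hot = np.eye(num_action)[actions]
action_prob = np.zeros((len(actions), num_action))
for i in range(len(actions)):
    visible = one_hot[visibility[i] == 1]
    action_prob[i] = visible.mean(axis=0) if visible.size > 0 else np.zeros(num_action)

print(action_prob)
# (rounded) [[0.   1.  ]   agent 0 sees agents 1 and 2, both chose action 1
#            [1.   0.  ]   agent 1 sees agents 0 and 3, both chose action 0
#            [0.67 0.33]   agent 2 sees agents 0, 1 and 3; two of them chose action 0
#            [0.   0.  ]]  agent 3 sees nobody, so its row stays zero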
