
Commit 7ac3e7d

polish(rjy): fix dqn net init

Author: nighood
1 parent: f10f952

4 files changed (+18 -14 lines)


ding/model/common/head.py (-1)

@@ -61,7 +61,6 @@ def __init__(
                 norm_type=norm_type
             ), block(hidden_size, output_size)
         )
-        nn.init.normal_(self.Q[1].weight, 0, 0.2)
 
     def forward(self, x: torch.Tensor) -> Dict:
         """

dizoo/ising_env/config/ising_mfq_config.py (+9 -2)

@@ -1,4 +1,5 @@
 from easydict import EasyDict
+from ding.utils import set_pkg_seed
 
 obs_shape = 4
 action_shape = 2
@@ -7,7 +8,7 @@
 agent_view_sight = 1
 
 ising_mfq_config = dict(
-    exp_name='ising_mfq_seed0',
+    exp_name='ising_mfq_seed0_debug',
     env=dict(
         collector_env_num=8,
         evaluator_env_num=8,
@@ -61,4 +62,10 @@
 if __name__ == '__main__':
     # or you can enter `ding -m serial -c ising_mfq_config.py -s 0`
     from ding.entry import serial_pipeline
-    serial_pipeline((main_config, create_config), seed=0, max_env_step=5e4)
+    from ding.model import DQN
+    seed = 1
+    set_pkg_seed(seed)
+    model = DQN(**ising_mfq_config.policy.model)
+    model.head.A[-1][0].bias.data.fill_(0)  # zero last layer bias
+    # print("init model successful")
+    serial_pipeline((main_config, create_config), seed=seed, model=model, max_env_step=5e4)
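
The entry now builds the DQN outside serial_pipeline so the extra head init runs before training starts. A sketch of the same pattern in isolation; the shapes are placeholders, and the head.A[-1][0] indexing assumes (as the config's own usage implies) a dueling head whose advantage branch A ends in a block whose first module is the final nn.Linear:

from ding.model import DQN

model = DQN(obs_shape=6, action_shape=2)  # dueling head is DI-engine's default

# Mirror the zero-bias init above on the last advantage layer.
model.head.A[-1][0].bias.data.fill_(0)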

dizoo/ising_env/envs/ising_model_env.py (+4 -6)

@@ -30,7 +30,7 @@ def __init__(self, cfg: dict) -> None:
 
     def calculate_action_prob(self, actions):
         num_action = self._action_space.n
-        N = actions.shape[0] # agent_num
+        N = actions.shape[0]  # agent_num
         # Convert actions to one_hot encoding
         one_hot_actions = np.eye(num_action)[actions.flatten()]
         action_prob = np.zeros((N, num_action))
@@ -84,9 +84,9 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None:
         self._dynamic_seed = dynamic_seed
         np.random.seed(self._seed)
 
-    def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep:
+    def step(self, action: np.ndarray) -> BaseEnvTimestep:
         action = to_ndarray(action)
-        if (len(action.shape) == 1):
+        if len(action.shape) == 1:
             action = np.expand_dims(action, axis=1)
         obs, rew, done, order_param, ups, downs = self._env._step(action)
         info = {"order_param": order_param, "ups": ups, "downs": downs, 'pre_action': self.pre_action}
@@ -95,9 +95,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep:
         obs = np.stack(obs)
         obs = np.concatenate([obs, pre_action_prob], axis=1)
         obs = to_ndarray(obs).astype(np.float32)
-        rew = np.stack(rew)
-        rew = np.squeeze(to_ndarray(rew).astype(np.float32), axis=1)
-        # rew = to_ndarray(rew).astype(np.float32)
+        rew = np.concatenate(rew)
         self._eval_episode_return += np.sum(rew)
 
         done = done[0]  # dones are the same for all agents
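
The reward fix only changes shape handling: the wrapped env returns a list of per-agent rewards, and np.concatenate flattens them to (N, ) in one step instead of the old stack-then-squeeze detour through (N, 1). A quick equivalence check, assuming each per-agent reward is a shape-(1, ) array:

import numpy as np

rew = [np.array([0.5]), np.array([-0.5]), np.array([1.0])]  # assumed per-agent layout

old = np.squeeze(np.stack(rew).astype(np.float32), axis=1)  # (3, 1) -> (3, )
new = np.concatenate(rew)                                   # (3, ) directly
assert old.shape == new.shape == (3, )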

dizoo/ising_env/envs/test_ising_model_env.py (+5 -5)

@@ -9,12 +9,12 @@
 @pytest.mark.envtest
 class TestIsingModelEnv:
 
-    def test_ising():
+    def test_ising(self):
         env = IsingModelEnv(EasyDict({'num_agents': num_agents, 'dim_spin': 2, 'agent_view_sight': 1}))
         env.seed(314, dynamic_seed=False)
         assert env._seed == 314
         obs = env.reset()
-        assert obs.shape == (100, 4 + 2)
+        assert obs.shape == (num_agents, 4 + 2)
         for _ in range(5):
             env.reset()
             np.random.seed(314)
@@ -30,9 +30,9 @@ def test_ising():
             timestep = env.step(random_action)
             print('timestep', timestep, '\n')
             assert isinstance(timestep.obs, np.ndarray)
-            assert isinstance(timestep.done[0], bool)
-            assert timestep.obs.shape == (100, 4 + 2)
-            assert timestep.reward.shape == (100, 1)
+            assert isinstance(timestep.done, bool)
+            assert timestep.obs.shape == (num_agents, 4 + 2)
+            assert timestep.reward.shape == (num_agents, )
             assert timestep.reward[0] >= env.reward_space.low
             assert timestep.reward[0] <= env.reward_space.high
             print(env.observation_space, env.action_space, env.reward_space)
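
Two notes on these test fixes. First, pytest instantiates TestIsingModelEnv and calls test_ising as a bound method, so the old zero-argument signature raised "TypeError: test_ising() takes 0 positional arguments but 1 was given"; adding self fixes it, as in this minimal sketch with hypothetical names:

import pytest


class TestExample:

    # pytest calls this as a bound method on a fresh instance,
    # so the first parameter must be self.
    def test_case(self):
        assert 1 + 1 == 2

Second, the assertions now track the env change above: done is a scalar bool, reward has shape (num_agents, ), and the hard-coded 100 is replaced by the num_agents constant so the test follows its own config.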
