feature(yzj): add ptz ctde pipeline #149

Open · wants to merge 12 commits into base: main
31 changes: 20 additions & 11 deletions lzero/mcts/buffer/game_buffer.py
@@ -118,33 +118,41 @@ def _sample_orig_data(self, batch_size: int) -> Tuple:

# +1e-6 for numerical stability
probs = self.game_pos_priorities ** self._alpha + 1e-6
probs /= probs.sum()
if self._cfg.multi_agent:
probs = np.array([probs[i] for i in range(0, len(probs), self._cfg.model.agent_num)]) #TODO: check this
probs /= probs.sum()
else:
probs /= probs.sum()

# sample according to transition index
# TODO(pu): replace=True
batch_index_list = np.random.choice(num_of_transitions, batch_size, p=probs, replace=False)
batch_index_list = np.random.choice(num_of_transitions//self._cfg.model.agent_num, batch_size, p=probs, replace=False)

if self._cfg.reanalyze_outdated is True:
# NOTE: used in reanalyze part
batch_index_list.sort()

weights_list = (num_of_transitions * probs[batch_index_list]) ** (-self._beta)
weights_list = ((num_of_transitions//self._cfg.model.agent_num) * probs[batch_index_list]) ** (-self._beta)
weights_list /= weights_list.max()

game_segment_list = []
pos_in_game_segment_list = []
agent_id_list = []
true_batch_index_list = []

for idx in batch_index_list:
game_segment_idx, pos_in_game_segment = self.game_segment_game_pos_look_up[idx]
game_segment_idx, pos_in_game_segment, agent_id = self.game_segment_game_pos_look_up[idx]
game_segment_idx -= self.base_idx
game_segment = self.game_segment_buffer[game_segment_idx]

game_segment_list.append(game_segment)
pos_in_game_segment_list.append(pos_in_game_segment)
for i in range(self._cfg.model.agent_num):
game_segment = self.game_segment_buffer[game_segment_idx*self._cfg.model.agent_num+i]
game_segment_list.append(game_segment)
pos_in_game_segment_list.append(pos_in_game_segment)
agent_id_list.append(agent_id+i)
true_batch_index_list.append(idx)

make_time = [time.time() for _ in range(len(batch_index_list))]
make_time = [time.time() for _ in range(len(true_batch_index_list))]

orig_data = (game_segment_list, pos_in_game_segment_list, batch_index_list, weights_list, make_time)
orig_data = (game_segment_list, pos_in_game_segment_list, true_batch_index_list, weights_list, make_time)
return orig_data

def _preprocess_to_play_and_action_mask(
@@ -349,8 +357,9 @@ def _push_game_segment(self, data: Any, meta: Optional[dict] = None) -> None:
self.game_pos_priorities = np.concatenate((self.game_pos_priorities, priorities))

self.game_segment_buffer.append(data)
agent_id = data.obs_segment[0]['agent_id']
self.game_segment_game_pos_look_up += [
(self.base_idx + len(self.game_segment_buffer) - 1, step_pos) for step_pos in range(len(data))
(self.base_idx + len(self.game_segment_buffer) - 1, step_pos, agent_id) for step_pos in range(len(data))
]

def remove_oldest_data_to_fit(self) -> None:
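The diff above makes _sample_orig_data multi-agent aware: in multi-agent mode the sampling probabilities are taken once per team step, and each sampled index is expanded back into agent_num consecutive per-agent transitions (the lookup table now also stores an agent_id per position). A minimal, self-contained sketch of that index arithmetic, assuming the per-agent transitions of one team step are stored back to back; the function and variable names below are illustrative, not part of the buffer API:

import numpy as np

def sample_team_steps(priorities, agent_num, batch_size, alpha=1.0, beta=1.0):
    # One priority per (step, agent) transition; keep only the first agent's
    # entry of each team step so a whole team step is sampled as a unit.
    probs = priorities ** alpha + 1e-6
    team_probs = probs[::agent_num]
    team_probs /= team_probs.sum()

    num_team_steps = len(priorities) // agent_num
    team_index = np.random.choice(num_team_steps, batch_size, p=team_probs, replace=False)

    # Importance-sampling weights are computed over team steps, matching the
    # corrected weights_list line in the hunk.
    weights = (num_team_steps * team_probs[team_index]) ** (-beta)
    weights /= weights.max()

    # Expand every sampled team step into its agent_num per-agent transitions.
    flat_index = (team_index[:, None] * agent_num + np.arange(agent_num)).ravel()
    return team_index, flat_index, weights

# Example: 3 agents, 30 stored transitions (10 team steps), sample 4 team steps.
team_idx, flat_idx, w = sample_team_steps(np.ones(30), agent_num=3, batch_size=4)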
37 changes: 28 additions & 9 deletions lzero/mcts/buffer/game_buffer_efficientzero.py
@@ -9,6 +9,8 @@
from lzero.mcts.utils import prepare_observation
from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
from .game_buffer_muzero import MuZeroGameBuffer
from ding.torch_utils import to_device, to_tensor
from ding.utils.data import default_collate


@BUFFER_REGISTRY.register('game_buffer_efficientzero')
@@ -101,6 +103,14 @@ def _prepare_reward_value_context(
td_steps_list, action_mask_segment, to_play_segment
"""
zero_obs = game_segment_list[0].zero_obs()
# zero_obs = np.array([{'agent_state': np.zeros((18,), dtype=np.float32),
# 'global_state': np.zeros((48,), dtype=np.float32),
# 'agent_alone_state': np.zeros((14,), dtype=np.float32),
# 'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),}])
zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
'global_state': np.zeros((14, ), dtype=np.float32),
'agent_alone_state': np.zeros((12,), dtype=np.float32),
'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
value_obs_list = []
# the value is valid or not (out of trajectory)
value_mask = []
@@ -152,7 +162,7 @@ def _prepare_reward_value_context(
value_mask.append(0)
obs = zero_obs

value_obs_list.append(obs)
value_obs_list.append(obs.tolist())

reward_value_context = [
value_obs_list, value_mask, pos_in_game_segment_list, rewards_list, game_segment_lens, td_steps_list,
@@ -196,7 +206,13 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
beg_index = self._cfg.mini_infer_size * i
end_index = self._cfg.mini_infer_size * (i + 1)

m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
if self._cfg.model.model_type and self._cfg.model.model_type in ['conv', 'mlp']:
m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
elif self._cfg.model.model_type and self._cfg.model.model_type == 'structure':
m_obs = value_obs_list[beg_index:end_index]
m_obs = sum(m_obs, [])
m_obs = default_collate(m_obs)
m_obs = to_device(m_obs, self._cfg.device)

# calculate the target value
m_output = model.initial_inference(m_obs)
@@ -205,13 +221,16 @@
# EfficientZero related core code
# ==============================================================
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)
# [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
# [
# m_output.latent_state,
# inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
# m_output.policy_logits
# ]
# )
m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
m_output.reward_hidden_state = (
m_output.reward_hidden_state[0].detach().cpu().numpy(),
m_output.reward_hidden_state[1].detach().cpu().numpy()
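For model_type == 'structure', the new branch above keeps the observations as a list of dicts, flattens the per-position lists with sum(m_obs, []), batches them with ding's default_collate, and moves the result to the device. A rough standalone illustration of what that flatten-and-collate step yields, using torch.utils.data.default_collate (exported by recent PyTorch versions) as a stand-in for the ding helper, and placeholder shapes rather than the real ptz observation sizes:

import numpy as np
import torch
from torch.utils.data import default_collate  # stand-in for ding.utils.data.default_collate

# After obs.tolist(), each entry of value_obs_list is a list of dict observations.
value_obs_list = [
    [{'agent_state': np.zeros(6, dtype=np.float32),
      'global_state': np.zeros(14, dtype=np.float32)}],
    [{'agent_state': np.ones(6, dtype=np.float32),
      'global_state': np.ones(14, dtype=np.float32)}],
]

m_obs = value_obs_list[0:2]      # one mini_infer_size slice
m_obs = sum(m_obs, [])           # flatten list of lists -> flat list of dicts
m_obs = default_collate(m_obs)   # dict of stacked tensors, e.g. agent_state: (2, 6)
m_obs = {k: v.to('cpu') for k, v in m_obs.items()}  # stand-in for ding's to_device

print(m_obs['agent_state'].shape, m_obs['global_state'].shape)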
43 changes: 32 additions & 11 deletions lzero/mcts/buffer/game_buffer_muzero.py
@@ -9,6 +9,8 @@
from lzero.mcts.utils import prepare_observation
from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
from .game_buffer import GameBuffer
from ding.torch_utils import to_device, to_tensor
from ding.utils.data import default_collate

if TYPE_CHECKING:
from lzero.policy import MuZeroPolicy, EfficientZeroPolicy, SampledEfficientZeroPolicy
@@ -199,6 +201,17 @@ def _prepare_reward_value_context(
td_steps_list, action_mask_segment, to_play_segment
"""
zero_obs = game_segment_list[0].zero_obs()
zero_obs = np.array([{
'agent_id': np.array(0),
'agent_state': np.zeros((18,), dtype=np.float32),
'global_state': np.zeros((30,), dtype=np.float32),
'agent_alone_state': np.zeros((14,), dtype=np.float32),
'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),
}])
# zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
# 'global_state': np.zeros((14, ), dtype=np.float32),
# 'agent_alone_state': np.zeros((12,), dtype=np.float32),
# 'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
value_obs_list = []
# the value is valid or not (out of game_segment)
value_mask = []
@@ -208,7 +221,7 @@
action_mask_segment, to_play_segment = [], []

td_steps_list = []
for game_segment, state_index, idx in zip(game_segment_list, pos_in_game_segment_list, batch_index_list):
for game_segment, state_index in zip(game_segment_list, pos_in_game_segment_list):
game_segment_len = len(game_segment)
game_segment_lens.append(game_segment_len)

@@ -242,7 +255,7 @@ def _prepare_reward_value_context(
value_mask.append(0)
obs = zero_obs

value_obs_list.append(obs)
value_obs_list.append(obs.tolist())

reward_value_context = [
value_obs_list, value_mask, pos_in_game_segment_list, rewards_list, game_segment_lens, td_steps_list,
@@ -377,21 +390,29 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
beg_index = self._cfg.mini_infer_size * i
end_index = self._cfg.mini_infer_size * (i + 1)

m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
if self._cfg.model.model_type and self._cfg.model.model_type in ['conv', 'mlp']:
m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
elif self._cfg.model.model_type and self._cfg.model.model_type == 'structure':
m_obs = value_obs_list[beg_index:end_index]
m_obs = sum(m_obs, [])
m_obs = default_collate(m_obs)
m_obs = to_device(m_obs, self._cfg.device)

# calculate the target value
m_output = model.initial_inference(m_obs)

if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)

# [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
# [
# m_output.latent_state,
# inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
# m_output.policy_logits
# ]
# )
m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
network_output.append(m_output)

# concat the output slices after model inference
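Both buffers now pad out-of-trajectory value observations with a hardcoded structured zero_obs dict instead of the game segment's own zero_obs(). A small sketch of the same padding idea with the sizes passed in as parameters; the helper and parameter names are illustrative, and the literal sizes are simply the ones hardcoded for this task in the hunk above:

import numpy as np

def make_structured_zero_obs(agent_dim, global_dim, alone_dim, alone_padding_dim):
    # One all-zero observation dict matching the structured (dict) obs space,
    # wrapped in a length-1 object array so obs.tolist() behaves like a real obs.
    return np.array([{
        'agent_id': np.array(0),
        'agent_state': np.zeros((agent_dim,), dtype=np.float32),
        'global_state': np.zeros((global_dim,), dtype=np.float32),
        'agent_alone_state': np.zeros((alone_dim,), dtype=np.float32),
        'agent_alone_padding_state': np.zeros((alone_padding_dim,), dtype=np.float32),
    }])

zero_obs = make_structured_zero_obs(18, 30, 14, 18)  # sizes used in the MuZero buffer above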
2 changes: 1 addition & 1 deletion lzero/mcts/ctree/ctree_sampled_efficientzero/lib/cnode.cpp
@@ -381,7 +381,7 @@ namespace tree
for (size_t iter = 0; iter < disturbed_probs.size(); iter++)
{
#ifdef __APPLE__
disc_action_with_probs.__emplace_back(std::make_pair(iter, disturbed_probs[iter]));
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
#else
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
#endif