feature(yzj): add ptz ctde pipeline #149

Open · wants to merge 12 commits into base: main
31 changes: 20 additions & 11 deletions lzero/mcts/buffer/game_buffer.py
@@ -118,33 +118,41 @@ def _sample_orig_data(self, batch_size: int) -> Tuple:

# +1e-6 for numerical stability
probs = self.game_pos_priorities ** self._alpha + 1e-6
probs /= probs.sum()
if self._cfg.multi_agent:
probs = np.array([probs[i] for i in range(0, len(probs), self._cfg.model.agent_num)]) #TODO: check this
probs /= probs.sum()
else:
probs /= probs.sum()

# sample according to transition index
# TODO(pu): replace=True
batch_index_list = np.random.choice(num_of_transitions, batch_size, p=probs, replace=False)
batch_index_list = np.random.choice(num_of_transitions//self._cfg.model.agent_num, batch_size, p=probs, replace=False)

if self._cfg.reanalyze_outdated is True:
# NOTE: used in reanalyze part
batch_index_list.sort()

weights_list = (num_of_transitions * probs[batch_index_list]) ** (-self._beta)
weights_list = ((num_of_transitions//self._cfg.model.agent_num) * probs[batch_index_list]) ** (-self._beta)
weights_list /= weights_list.max()

game_segment_list = []
pos_in_game_segment_list = []
agent_id_list = []
true_batch_index_list = []

for idx in batch_index_list:
game_segment_idx, pos_in_game_segment = self.game_segment_game_pos_look_up[idx]
game_segment_idx, pos_in_game_segment, agent_id = self.game_segment_game_pos_look_up[idx]
game_segment_idx -= self.base_idx
game_segment = self.game_segment_buffer[game_segment_idx]

game_segment_list.append(game_segment)
pos_in_game_segment_list.append(pos_in_game_segment)
for i in range(self._cfg.model.agent_num):
game_segment = self.game_segment_buffer[game_segment_idx*self._cfg.model.agent_num+i]
game_segment_list.append(game_segment)
pos_in_game_segment_list.append(pos_in_game_segment)
agent_id_list.append(agent_id+i)
true_batch_index_list.append(idx)

make_time = [time.time() for _ in range(len(batch_index_list))]
make_time = [time.time() for _ in range(len(true_batch_index_list))]

orig_data = (game_segment_list, pos_in_game_segment_list, batch_index_list, weights_list, make_time)
orig_data = (game_segment_list, pos_in_game_segment_list, true_batch_index_list, weights_list, make_time)
return orig_data

def _preprocess_to_play_and_action_mask(
@@ -349,8 +357,9 @@ def _push_game_segment(self, data: Any, meta: Optional[dict] = None) -> None:
self.game_pos_priorities = np.concatenate((self.game_pos_priorities, priorities))

self.game_segment_buffer.append(data)
agent_id = data.obs_segment[0]['agent_id']
self.game_segment_game_pos_look_up += [
(self.base_idx + len(self.game_segment_buffer) - 1, step_pos) for step_pos in range(len(data))
(self.base_idx + len(self.game_segment_buffer) - 1, step_pos, agent_id) for step_pos in range(len(data))
]

def remove_oldest_data_to_fit(self) -> None:
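The diff above makes _sample_orig_data multi-agent aware: in multi-agent mode the sampling probabilities are taken once per team step, and each sampled index is expanded back into agent_num consecutive per-agent transitions (the lookup table now also stores an agent_id per position). A minimal, self-contained sketch of that index arithmetic, assuming the per-agent transitions of one team step are stored back to back; the function and variable names below are illustrative, not part of the buffer API:

import numpy as np

def sample_team_steps(priorities, agent_num, batch_size, alpha=1.0, beta=1.0):
    # One priority per (step, agent) transition; keep only the first agent's
    # entry of each team step so a whole team step is sampled as a unit.
    probs = priorities ** alpha + 1e-6
    team_probs = probs[::agent_num]
    team_probs /= team_probs.sum()

    num_team_steps = len(priorities) // agent_num
    team_index = np.random.choice(num_team_steps, batch_size, p=team_probs, replace=False)

    # Importance-sampling weights are computed over team steps, matching the
    # corrected weights_list line in the hunk.
    weights = (num_team_steps * team_probs[team_index]) ** (-beta)
    weights /= weights.max()

    # Expand every sampled team step into its agent_num per-agent transitions.
    flat_index = (team_index[:, None] * agent_num + np.arange(agent_num)).ravel()
    return team_index, flat_index, weights

# Example: 3 agents, 30 stored transitions (10 team steps), sample 4 team steps.
team_idx, flat_idx, w = sample_team_steps(np.ones(30), agent_num=3, batch_size=4)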
37 changes: 28 additions & 9 deletions lzero/mcts/buffer/game_buffer_efficientzero.py
@@ -9,6 +9,8 @@
from lzero.mcts.utils import prepare_observation
from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
from .game_buffer_muzero import MuZeroGameBuffer
from ding.torch_utils import to_device, to_tensor
from ding.utils.data import default_collate


@BUFFER_REGISTRY.register('game_buffer_efficientzero')
@@ -101,6 +103,14 @@ def _prepare_reward_value_context(
td_steps_list, action_mask_segment, to_play_segment
"""
zero_obs = game_segment_list[0].zero_obs()
# zero_obs = np.array([{'agent_state': np.zeros((18,), dtype=np.float32),
# 'global_state': np.zeros((48,), dtype=np.float32),
# 'agent_alone_state': np.zeros((14,), dtype=np.float32),
# 'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),}])
zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
'global_state': np.zeros((14, ), dtype=np.float32),
'agent_alone_state': np.zeros((12,), dtype=np.float32),
'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
value_obs_list = []
# the value is valid or not (out of trajectory)
value_mask = []
@@ -152,7 +162,7 @@ def _prepare_reward_value_context(
value_mask.append(0)
obs = zero_obs

value_obs_list.append(obs)
value_obs_list.append(obs.tolist())

reward_value_context = [
value_obs_list, value_mask, pos_in_game_segment_list, rewards_list, game_segment_lens, td_steps_list,
@@ -196,7 +206,13 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
beg_index = self._cfg.mini_infer_size * i
end_index = self._cfg.mini_infer_size * (i + 1)

m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
if self._cfg.model.model_type and self._cfg.model.model_type in ['conv', 'mlp']:
m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
elif self._cfg.model.model_type and self._cfg.model.model_type == 'structure':
m_obs = value_obs_list[beg_index:end_index]
m_obs = sum(m_obs, [])
m_obs = default_collate(m_obs)
m_obs = to_device(m_obs, self._cfg.device)

# calculate the target value
m_output = model.initial_inference(m_obs)
@@ -205,13 +221,16 @@
# EfficientZero related core code
# ==============================================================
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)
# [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
# [
# m_output.latent_state,
# inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
# m_output.policy_logits
# ]
# )
m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
m_output.reward_hidden_state = (
m_output.reward_hidden_state[0].detach().cpu().numpy(),
m_output.reward_hidden_state[1].detach().cpu().numpy()
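For model_type == 'structure', the new branch above keeps the observations as a list of dicts, flattens the per-position lists with sum(m_obs, []), batches them with ding's default_collate, and moves the result to the device. A rough standalone illustration of what that flatten-and-collate step yields, using torch.utils.data.default_collate (exported by recent PyTorch versions) as a stand-in for the ding helper, and placeholder shapes rather than the real ptz observation sizes:

import numpy as np
import torch
from torch.utils.data import default_collate  # stand-in for ding.utils.data.default_collate

# After obs.tolist(), each entry of value_obs_list is a list of dict observations.
value_obs_list = [
    [{'agent_state': np.zeros(6, dtype=np.float32),
      'global_state': np.zeros(14, dtype=np.float32)}],
    [{'agent_state': np.ones(6, dtype=np.float32),
      'global_state': np.ones(14, dtype=np.float32)}],
]

m_obs = value_obs_list[0:2]      # one mini_infer_size slice
m_obs = sum(m_obs, [])           # flatten list of lists -> flat list of dicts
m_obs = default_collate(m_obs)   # dict of stacked tensors, e.g. agent_state: (2, 6)
m_obs = {k: v.to('cpu') for k, v in m_obs.items()}  # stand-in for ding's to_device

print(m_obs['agent_state'].shape, m_obs['global_state'].shape)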
43 changes: 32 additions & 11 deletions lzero/mcts/buffer/game_buffer_muzero.py
@@ -9,6 +9,8 @@
from lzero.mcts.utils import prepare_observation
from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
from .game_buffer import GameBuffer
from ding.torch_utils import to_device, to_tensor
from ding.utils.data import default_collate

if TYPE_CHECKING:
from lzero.policy import MuZeroPolicy, EfficientZeroPolicy, SampledEfficientZeroPolicy
@@ -199,6 +201,17 @@ def _prepare_reward_value_context(
td_steps_list, action_mask_segment, to_play_segment
"""
zero_obs = game_segment_list[0].zero_obs()
zero_obs = np.array([{
'agent_id': np.array(0),
'agent_state': np.zeros((18,), dtype=np.float32),
'global_state': np.zeros((30,), dtype=np.float32),
'agent_alone_state': np.zeros((14,), dtype=np.float32),
'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),
}])
# zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
# 'global_state': np.zeros((14, ), dtype=np.float32),
# 'agent_alone_state': np.zeros((12,), dtype=np.float32),
# 'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
value_obs_list = []
# the value is valid or not (out of game_segment)
value_mask = []
@@ -208,7 +221,7 @@
action_mask_segment, to_play_segment = [], []

td_steps_list = []
for game_segment, state_index, idx in zip(game_segment_list, pos_in_game_segment_list, batch_index_list):
for game_segment, state_index in zip(game_segment_list, pos_in_game_segment_list):
game_segment_len = len(game_segment)
game_segment_lens.append(game_segment_len)

@@ -242,7 +255,7 @@ def _prepare_reward_value_context(
value_mask.append(0)
obs = zero_obs

value_obs_list.append(obs)
value_obs_list.append(obs.tolist())

reward_value_context = [
value_obs_list, value_mask, pos_in_game_segment_list, rewards_list, game_segment_lens, td_steps_list,
@@ -377,21 +390,29 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
beg_index = self._cfg.mini_infer_size * i
end_index = self._cfg.mini_infer_size * (i + 1)

m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
if self._cfg.model.model_type and self._cfg.model.model_type in ['conv', 'mlp']:
m_obs = torch.from_numpy(value_obs_list[beg_index:end_index]).to(self._cfg.device).float()
elif self._cfg.model.model_type and self._cfg.model.model_type == 'structure':
m_obs = value_obs_list[beg_index:end_index]
m_obs = sum(m_obs, [])
m_obs = default_collate(m_obs)
m_obs = to_device(m_obs, self._cfg.device)

# calculate the target value
m_output = model.initial_inference(m_obs)

if not model.training:
# if not in training, obtain the scalars of the value/reward
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
[
m_output.latent_state,
inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
m_output.policy_logits
]
)

# [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
# [
# m_output.latent_state,
# inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
# m_output.policy_logits
# ]
# )
m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
network_output.append(m_output)

# concat the output slices after model inference
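Both buffers now pad out-of-trajectory value observations with a hardcoded structured zero_obs dict instead of the game segment's own zero_obs(). A small sketch of the same padding idea with the sizes passed in as parameters; the helper and parameter names are illustrative, and the literal sizes are simply the ones hardcoded for this task in the hunk above:

import numpy as np

def make_structured_zero_obs(agent_dim, global_dim, alone_dim, alone_padding_dim):
    # One all-zero observation dict matching the structured (dict) obs space,
    # wrapped in a length-1 object array so obs.tolist() behaves like a real obs.
    return np.array([{
        'agent_id': np.array(0),
        'agent_state': np.zeros((agent_dim,), dtype=np.float32),
        'global_state': np.zeros((global_dim,), dtype=np.float32),
        'agent_alone_state': np.zeros((alone_dim,), dtype=np.float32),
        'agent_alone_padding_state': np.zeros((alone_padding_dim,), dtype=np.float32),
    }])

zero_obs = make_structured_zero_obs(18, 30, 14, 18)  # sizes used in the MuZero buffer above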
2 changes: 1 addition & 1 deletion lzero/mcts/ctree/ctree_sampled_efficientzero/lib/cnode.cpp
@@ -381,7 +381,7 @@ namespace tree
for (size_t iter = 0; iter < disturbed_probs.size(); iter++)
{
#ifdef __APPLE__
disc_action_with_probs.__emplace_back(std::make_pair(iter, disturbed_probs[iter]));
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
#else
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
#endif