Commit c6ea136: FIXUP - Simplifications
Parent: 9966f72

12 files changed: +186 -269 lines

diagnose_model.py (+9, -17)

@@ -54,18 +54,18 @@ def get_virtual_trajectory_from_obs(
         virtual_to_play = self.config.players[0]

         # Generate new root
-        value, reward, policy_logits, hidden_state = self.model.recurrent_inference(
+        value, reward, policy_parameters, hidden_state = self.model.recurrent_inference(
            root.hidden_state,
            torch.tensor([[action]]).to(root.hidden_state.device),
        )
        value = support_to_scalar(value, self.config.support_size).item()
        reward = support_to_scalar(reward, self.config.support_size).item()
        root = Node(0)
+        sampled_actions = self.model.sample_actions(policy_parameters)
        root.expand(
-            self.config.action_space,
+            sampled_actions,
            virtual_to_play,
            reward,
-            policy_logits,
            hidden_state,
        )

@@ -208,10 +208,10 @@ def __init__(self, title, config):
        self.policies_after_planning = []
        # Not implemented, need to store them in every nodes of the mcts
        self.prior_values = []
-        self.values_after_planning = [[numpy.NaN] * len(self.config.action_space)]
+        self.values_after_planning = [[numpy.NaN] * sum(self.config.action_shape)]
        self.prior_root_value = []
        self.root_value_after_planning = []
-        self.prior_rewards = [[numpy.NaN] * len(self.config.action_space)]
+        self.prior_rewards = [[numpy.NaN] * sum(self.config.action_shape)]
        self.mcts_depth = []

    def store_info(self, root, mcts_info, action, reward, new_prior_root_value=None):
@@ -222,25 +222,19 @@ def store_info(self, root, mcts_info, action, reward, new_prior_root_value=None)
        self.prior_policies.append(
            [
                root.children[action].prior
-                if action in root.children.keys()
-                else numpy.NaN
-                for action in self.config.action_space
+                for action in root.children.keys()
            ]
        )
        self.policies_after_planning.append(
            [
                root.children[action].visit_count / self.config.num_simulations
-                if action in root.children.keys()
-                else numpy.NaN
-                for action in self.config.action_space
+                for action in root.children.keys()
            ]
        )
        self.values_after_planning.append(
            [
                root.children[action].value()
-                if action in root.children.keys()
-                else numpy.NaN
-                for action in self.config.action_space
+                for action in root.children.keys()
            ]
        )
        self.prior_root_value.append(
@@ -252,9 +246,7 @@ def store_info(self, root, mcts_info, action, reward, new_prior_root_value=None)
        self.prior_rewards.append(
            [
                root.children[action].reward
-                if action in root.children.keys()
-                else numpy.NaN
-                for action in self.config.action_space
+                for action in root.children.keys()
            ]
        )
        self.mcts_depth.append(mcts_info["max_tree_depth"])

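The calls to self.model.sample_actions(policy_parameters) introduced above are not defined anywhere in this diff. Purely as a sketch, assuming policy_parameters are categorical logits of shape (1, num_actions) and reusing the sample_size / policy_distribution fields added to the game configs below, such a helper could look like the following; apart from the method name, none of it is taken from the repository.

import torch
from torch.distributions import Categorical

def sample_actions(policy_parameters, sample_size=4, policy_distribution=Categorical):
    """Draw a subset of candidate actions from the policy head output.

    Sketch only: assumes `policy_parameters` are logits of shape (1, num_actions).
    """
    dist = policy_distribution(logits=policy_parameters)
    samples = dist.sample((sample_size,))            # shape: (sample_size, 1)
    # Return plain ints so they can serve as keys of node.children.
    return [int(a) for a in samples.squeeze(-1)]
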
game_history.py (+6, -12)

@@ -12,30 +12,24 @@ def __init__(self):
        self.to_play_history = []
        self.child_visits = []
        self.root_values = []
+        self.sampled_actions_history = []
        self.reanalysed_predicted_root_values = None
        # For PER
        self.priorities = None
        self.game_priority = None

-    def store_search_statistics(self, root, action_space):
+    def store_search_statistics(self, root):
        # Turn visit count from root into a policy
        if root is not None:
            sum_visits = sum(child.visit_count for child in root.children.values())
-            self.child_visits.append(
-                [
-                    root.children[a].visit_count / sum_visits
-                    if a in root.children
-                    else 0
-                    for a in action_space
-                ]
-            )
-
+            self.child_visits.append([root.children[a].visit_count / sum_visits for a in root.children.keys()])
+            self.sampled_actions_history.append(root.sampled_actions)
            self.root_values.append(root.value())
        else:
            self.root_values.append(None)

    def get_stacked_observations(
-        self, index, num_stacked_observations, action_space_size
+        self, index, num_stacked_observations
    ):
        """
        Generate a new observation with the observation at the index position
@@ -55,7 +49,7 @@ def get_stacked_observations(
                    [
                        numpy.ones_like(stacked_observations[0])
                        * self.action_history[past_observation_index + 1]
-                        / action_space_size
+                        / len(self.sampled_actions_history[past_observation_index + 1])
                    ],
                )
            )

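With the full-action-space padding gone, child_visits[i] no longer lines up with a fixed action ordering; the actions it refers to now have to come from sampled_actions_history[i]. A hedged illustration of how the two lists could be re-joined downstream (the replay-buffer side is not part of this commit, and this assumes root.sampled_actions holds the same actions, in the same order, as root.children):

def policy_target_at(game_history, index):
    # Pair each stored action with its normalised visit share from the search
    # at that position. Assumption: the two lists are parallel.
    actions = game_history.sampled_actions_history[index]
    visits = game_history.child_visits[index]
    return dict(zip(actions, visits))

# e.g. {1: 0.56, 0: 0.31, 3: 0.13} for three actions sampled at `index`
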
games/breakout.py (+6, -2)

@@ -4,6 +4,7 @@
 import gym
 import numpy
 import torch
+from torch.distributions import Categorical

 from .abstract_game import AbstractGame

@@ -54,7 +55,7 @@ def __init__(self):


        ### Network
-        self.network = "resnet" # "resnet" / "fullyconnected"
+        self.network = "sampled" # "resnet" / "fullyconnected" / "sampled"
        self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))

        # Residual Network
@@ -76,7 +77,10 @@ def __init__(self):
        self.fc_value_layers = [] # Define the hidden layers in the value network
        self.fc_policy_layers = [] # Define the hidden layers in the policy network

-
+        # Sampled
+        self.sample_size = 4
+        self.action_shape = [4]
+        self.policy_distribution = Categorical

        ### Training
        self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs

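Storing the distribution class itself (policy_distribution = Categorical) rather than hard-coding a softmax is what lets the same three config fields describe other action spaces. Purely as an illustration of that design choice, and not code from this commit, a continuous task could plug in a Normal distribution instead:

import torch
from torch.distributions import Normal

# Hypothetical continuous-control config values, for illustration only.
action_shape = [2]            # a 2-dimensional continuous action
sample_size = 8
policy_distribution = Normal

# Pretend the policy head emitted a mean and a log-std per action dimension.
mean = torch.zeros(1, sum(action_shape))
log_std = torch.zeros(1, sum(action_shape))
dist = policy_distribution(mean, log_std.exp())
candidate_actions = dist.sample((sample_size,))   # shape: (sample_size, 1, 2)
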
games/cartpole.py (+11, -7)

@@ -49,9 +49,9 @@ def __init__(self):


        ### Network
-        self.network = "fullyconnected" # "resnet" / "fullyconnected"
+        self.network = "sampled" # "resnet" / "fullyconnected"
        self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))
-
+
        # Residual Network
        self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture)
        self.blocks = 1 # Number of blocks in the ResNet
@@ -66,18 +66,22 @@ def __init__(self):
        # Fully Connected Network
        self.encoding_size = 8
        self.fc_representation_layers = [] # Define the hidden layers in the representation network
-        self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
-        self.fc_reward_layers = [16] # Define the hidden layers in the reward network
-        self.fc_value_layers = [16] # Define the hidden layers in the value network
-        self.fc_policy_layers = [16] # Define the hidden layers in the policy network
+        self.fc_dynamics_layers = [32] # Define the hidden layers in the dynamics network
+        self.fc_reward_layers = [32] # Define the hidden layers in the reward network
+        self.fc_value_layers = [32] # Define the hidden layers in the value network
+        self.fc_policy_layers = [128, 128] # Define the hidden layers in the policy network


+        # Sampled
+        self.sample_size = 50
+        self.action_shape = [2]
+        self.policy_distribution = torch.distributions.Categorical

        ### Training
        self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs
        self.save_model = True # Save the checkpoint in results_path as model.checkpoint
        self.training_steps = 10000 # Total number of training steps (ie weights update according to a batch)
-        self.batch_size = 128 # Number of parts of games to train on at each training step
+        self.batch_size = 256 # Number of parts of games to train on at each training step
        self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
        self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
        self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available

games/connect4.py (+7, -3)

@@ -3,6 +3,7 @@

 import numpy
 import torch
+from torch.distributions import Categorical

 from .abstract_game import AbstractGame

@@ -48,7 +49,7 @@ def __init__(self):


        ### Network
-        self.network = "resnet" # "resnet" / "fullyconnected"
+        self.network = "sampled" # "resnet" / "fullyconnected"
        self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))

        # Residual Network
@@ -70,14 +71,17 @@ def __init__(self):
        self.fc_value_layers = [] # Define the hidden layers in the value network
        self.fc_policy_layers = [] # Define the hidden layers in the policy network

-
+        # Sampled
+        self.sample_size = 7
+        self.action_shape = [7]
+        self.policy_distribution = Categorical

        ### Training
        self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs
        self.save_model = True # Save the checkpoint in results_path as model.checkpoint
        self.training_steps = 100000 # Total number of training steps (ie weights update according to a batch)
        self.batch_size = 64 # Number of parts of games to train on at each training step
-        self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
+        self.checkpoint_interval = 200 # Number of training steps before using the model for self-playing
        self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
        self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available

mcts.py (+8, -21)

@@ -45,27 +45,19 @@ def run(
            .unsqueeze(0)
            .to(next(model.parameters()).device)
        )
-        (
-            root_predicted_value,
-            reward,
-            policy_logits,
-            hidden_state,
-        ) = model.initial_inference(observation)
+        root_predicted_value, reward, policy_parameters, hidden_state = model.initial_inference(observation)
        root_predicted_value = support_to_scalar(
            root_predicted_value, self.config.support_size
        ).item()
        reward = support_to_scalar(reward, self.config.support_size).item()
        assert (
            legal_actions
        ), f"Legal actions should not be an empty array. Got {legal_actions}."
-        assert set(legal_actions).issubset(
-            set(self.config.action_space)
-        ), "Legal actions should be a subset of the action space."
+        sampled_actions = model.sample_actions(policy_parameters)
        root.expand(
-            legal_actions,
+            sampled_actions,
            to_play,
            reward,
-            policy_logits,
            hidden_state,
        )

@@ -98,17 +90,17 @@ def run(
            # Inside the search tree we use the dynamics function to obtain the next hidden
            # state given an action and the previous hidden state
            parent = search_path[-2]
-            value, reward, policy_logits, hidden_state = model.recurrent_inference(
+            value, reward, policy_parameters, hidden_state = model.recurrent_inference(
                parent.hidden_state,
                torch.tensor([[action]]).to(parent.hidden_state.device),
            )
+            sampled_actions = model.sample_actions(policy_parameters)
            value = support_to_scalar(value, self.config.support_size).item()
            reward = support_to_scalar(reward, self.config.support_size).item()
            node.expand(
-                self.config.action_space,
+                sampled_actions,
                virtual_to_play,
                reward,
-                policy_logits,
                hidden_state,
            )

@@ -130,13 +122,8 @@ def select_child(self, node, min_max_stats):
            self.ucb_score(node, child, min_max_stats)
            for action, child in node.children.items()
        )
-        action = numpy.random.choice(
-            [
-                action
-                for action, child in node.children.items()
-                if self.ucb_score(node, child, min_max_stats) == max_ucb
-            ]
-        )
+        actions = [action for action, child in node.children.items() if self.ucb_score(node, child, min_max_stats) == max_ucb]
+        action = actions[numpy.random.choice(range(len(actions)))]
        return action, node.children[action]

    def ucb_score(self, parent, child, min_max_stats):

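Node.expand is now called with a list of sampled actions and without policy logits, and game_history.py above reads root.sampled_actions back. The Node class itself is not in this commit; the following is only a guess at the minimal shape such a node could take, with uniform priors over the distinct sampled actions as an explicit assumption:

class Node:
    def __init__(self, prior):
        self.prior = prior
        self.visit_count = 0
        self.value_sum = 0
        self.children = {}
        self.hidden_state = None
        self.reward = 0
        self.to_play = -1
        self.sampled_actions = []

    def expand(self, sampled_actions, to_play, reward, hidden_state):
        # Remember the sample so GameHistory.store_search_statistics can pair
        # it with the visit counts later on.
        self.sampled_actions = sampled_actions
        self.to_play = to_play
        self.reward = reward
        self.hidden_state = hidden_state
        # Assumption: each distinct sampled action gets an equal prior.
        unique_actions = list(dict.fromkeys(sampled_actions))
        for action in unique_actions:
            self.children[action] = Node(1 / len(unique_actions))

    def value(self):
        return self.value_sum / self.visit_count if self.visit_count else 0

One plausible reason for the rewritten tie-breaking in select_child is that numpy.random.choice cannot pick directly from a list of non-scalar actions (it requires a 1-D array), whereas drawing an index and then indexing the Python list works for any hashable action.
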
models/muzero_network.py (+1, -1)

@@ -42,7 +42,7 @@ def __new__(cls, config):
                config.action_shape,
                config.encoding_size,
                config.sample_size,
-                config.blocks,
+                config.policy_distribution,
                config.fc_reward_layers,
                config.fc_value_layers,
                config.fc_policy_layers,

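Only one constructor argument changes here (config.policy_distribution in place of config.blocks), so the surrounding factory is implied rather than shown. A rough, self-contained sketch of how such a dispatch might look; the class name MuZeroSampledNetwork, the stub body, and the truncated argument list are assumptions, not the repository's actual code:

class MuZeroSampledNetwork:
    # Stub standing in for the real network class, which is not shown in this diff.
    def __init__(self, action_shape, encoding_size, sample_size, policy_distribution,
                 fc_reward_layers, fc_value_layers, fc_policy_layers):
        self.action_shape = action_shape
        self.sample_size = sample_size
        self.policy_distribution = policy_distribution


class MuZeroNetwork:
    def __new__(cls, config):
        # Dispatch on the network type selected in the game config.
        if config.network == "sampled":
            return MuZeroSampledNetwork(
                config.action_shape,
                config.encoding_size,
                config.sample_size,
                config.policy_distribution,  # replaces config.blocks in this commit
                config.fc_reward_layers,
                config.fc_value_layers,
                config.fc_policy_layers,
            )
        raise NotImplementedError(f"Unsupported network type: {config.network}")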