
Commit

Merge pull request #17 from openai/backprop
Add backprop support + simple BC example
Miffyli authored Jul 26, 2022
2 parents f63a391 + b61623e commit 96b094e
Showing 7 changed files with 478 additions and 19 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -96,6 +96,27 @@ A window should pop up which shows the video frame-by-frame, showing the predict

Note that `run_inverse_dynamics_model.py` is designed to be a demo of the IDM, not code to put it into practice.

# Using behavioural cloning to fine-tune the models

**Disclaimer:** This code is a rough demonstration only, not an exact recreation of what the original VPT paper did (but it does contain some preprocessing steps you should be aware of)! As such, do not expect it to replicate the original experiments. This code has been designed to be runnable on consumer hardware (e.g., 8GB of VRAM).

Setup:
* Install requirements: `pip install -r requirements.txt`
* Download the `.weights` and `.model` files for the model you want to fine-tune.
* Download the contractor data (see below) and place the `.mp4` and `.jsonl` files in the same directory (e.g., `data`). With the default settings, you need at least 12 recordings (a quick layout check is sketched after this list).
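As a quick sanity check of the layout, the sketch below (not part of the repository) assumes each recording is an `.mp4`/`.jsonl` pair sharing a basename, which appears to be what `data_loader.py` expects, and lists recordings that are missing their action file:

```
# Hypothetical layout check, not part of the repository.
# Assumes each .mp4 recording has a matching .jsonl action file with the same basename.
from pathlib import Path

data_dir = Path("data")
mp4_files = sorted(data_dir.glob("*.mp4"))
missing = [p for p in mp4_files if not p.with_suffix(".jsonl").exists()]
print(f"Found {len(mp4_files)} recordings, {len(missing)} missing a .jsonl file")
for p in missing:
    print("  missing actions for", p.name)
```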

If you downloaded the "1x Width" models and placed some data under the `data` directory, you can perform fine-tuning with

```
python behavioural_cloning.py --data-dir data --in-model foundation-model-1x.model --in-weights foundation-model-1x.weights --out-weights finetuned-1x.weights
```

You can then use `finetuned-1x.weights` when running the agent (see the sketch below). You can change the training settings at the top of `behavioural_cloning.py`.
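For example, a rollout could look roughly like the following sketch, which mirrors how `behavioural_cloning.py` builds the agent (the environment name, step count, and rendering here are illustrative, not the repository's exact run script):

```
# Illustrative rollout sketch; mirrors the agent setup in behavioural_cloning.py.
import pickle
import gym
import minerl  # noqa: F401 -- registers the MineRL environments
from agent import MineRLAgent

# Load architecture hyperparameters stored in the .model file
agent_parameters = pickle.load(open("foundation-model-1x.model", "rb"))
policy_kwargs = agent_parameters["model"]["args"]["net"]["args"]
pi_head_kwargs = agent_parameters["model"]["args"]["pi_head_opts"]
pi_head_kwargs["temperature"] = float(pi_head_kwargs["temperature"])

env = gym.make("MineRLBasaltFindCave-v0")
agent = MineRLAgent(env, policy_kwargs=policy_kwargs, pi_head_kwargs=pi_head_kwargs)
agent.load_weights("finetuned-1x.weights")

obs = env.reset()
for _ in range(1000):
    action = agent.get_action(obs)
    obs, _, done, _ = env.step(action)
    env.render()
    if done:
        break
env.close()
```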

Major limitations:
- Only trains a single step at a time, i.e., errors are not propagated through timesteps.
- Computes gradients one sample at a time to keep memory use low, which also slows training down (see the sketch below).
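Concretely, the training loop in `behavioural_cloning.py` (shown in full later in this commit) follows the gradient-accumulation pattern sketched here; the linear model and random data are stand-ins for the VPT policy and the recordings:

```
# Schematic of per-sample gradient accumulation: backward() runs once per
# sample, the optimizer steps once per batch. Model and data are stand-ins.
import torch as th

model = th.nn.Linear(4, 2)
optimizer = th.optim.Adam(model.parameters(), lr=1e-4)
BATCH_SIZE = 8

batch = [th.randn(1, 4) for _ in range(BATCH_SIZE)]
for sample in batch:
    loss = model(sample).pow(2).mean() / BATCH_SIZE  # divide so gradients average over the batch
    loss.backward()                                  # gradients accumulate in .grad
th.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
optimizer.step()
optimizer.zero_grad()
```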

# Contractor Demonstrations

### Versions
44 changes: 35 additions & 9 deletions agent.py
@@ -139,7 +139,11 @@ def reset(self):
        self.hidden_state = self.policy.initial_state(1)

    def _env_obs_to_agent(self, minerl_obs):
-        """Turn observation from MineRL environment into model's observation"""
+        """
+        Turn observation from MineRL environment into model's observation
+        Returns torch tensors.
+        """
        agent_input = resize_image(minerl_obs["pov"], AGENT_RESOLUTION)[None]
        agent_input = {"img": th.from_numpy(agent_input).to(self.device)}
        return agent_input
@@ -149,17 +153,39 @@ def _agent_action_to_env(self, agent_action):
        # This is quite an important step (for some reason).
        # For the sake of your sanity, remember to do this step (manual conversion to numpy)
        # before proceeding. Otherwise, your agent might be a little derp.
-        action = {
-            "buttons": agent_action["buttons"].cpu().numpy(),
-            "camera": agent_action["camera"].cpu().numpy()
-        }
+        action = agent_action
+        if isinstance(action["buttons"], th.Tensor):
+            action = {
+                "buttons": agent_action["buttons"].cpu().numpy(),
+                "camera": agent_action["camera"].cpu().numpy()
+            }
        minerl_action = self.action_mapper.to_factored(action)
        minerl_action_transformed = self.action_transformer.policy2env(minerl_action)
        return minerl_action_transformed

-    def _env_action_to_agent(self, minerl_action):
-        """Turn action from MineRL to model's action"""
-        raise NotImplementedError()
+    def _env_action_to_agent(self, minerl_action_transformed, to_torch=False, check_if_null=False):
+        """
+        Turn an action from MineRL into the model's action.
+        Note that this will add batch dimensions to the action.
+        Returns numpy arrays, unless `to_torch` is True, in which case it returns torch tensors.
+        If `check_if_null` is True, check if the action is null (no action) after the initial
+        transformation. This matches the behaviour of OpenAI's VPT work.
+        If the action is null, return `None` instead.
+        """
+        minerl_action = self.action_transformer.env2policy(minerl_action_transformed)
+        if check_if_null:
+            if np.all(minerl_action["buttons"] == 0) and np.all(minerl_action["camera"] == self.action_transformer.camera_zero_bin):
+                return None
+
+        # Add batch dims if not present
+        if minerl_action["camera"].ndim == 1:
+            minerl_action = {k: v[None] for k, v in minerl_action.items()}
+        action = self.action_mapper.from_factored(minerl_action)
+        if to_torch:
+            action = {k: th.from_numpy(v).to(self.device) for k, v in action.items()}
+        return action

    def get_action(self, minerl_obs):
        """
@@ -177,4 +203,4 @@ def get_action(self, minerl_obs):
            stochastic=True
        )
        minerl_action = self._agent_action_to_env(agent_action)
        return minerl_action
143 changes: 143 additions & 0 deletions behavioural_cloning.py
@@ -0,0 +1,143 @@
# Basic behavioural cloning
# Note: this uses gradient accumulation with a batch size of one
# to perform training.
# This will fit even on smaller GPUs (tested on an 8GB one),
# but it is slow.
# NOTE: This is _not_ the original code used for VPT!
# This is merely to illustrate how to fine-tune the models and includes
# the processing steps used.

# This will likely be much worse than what the original VPT did:
# we are not training on full sequences, but only one step at a time to save VRAM.

from argparse import ArgumentParser
import pickle
import time

import gym
import minerl
import torch as th
import numpy as np

from agent import PI_HEAD_KWARGS, MineRLAgent
from data_loader import DataLoader
from lib.tree_util import tree_map

EPOCHS = 2
# Needs to be <= number of videos
BATCH_SIZE = 8
# Ideally more than the batch size to create
# variation in the batches (otherwise, you will
# get a bunch of consecutive samples).
# Decrease this (and BATCH_SIZE) if you run out of memory
N_WORKERS = 12
DEVICE = "cuda"

LOSS_REPORT_RATE = 100

LEARNING_RATE = 0.000181
WEIGHT_DECAY = 0.039428
MAX_GRAD_NORM = 5.0

def load_model_parameters(path_to_model_file):
    agent_parameters = pickle.load(open(path_to_model_file, "rb"))
    policy_kwargs = agent_parameters["model"]["args"]["net"]["args"]
    pi_head_kwargs = agent_parameters["model"]["args"]["pi_head_opts"]
    pi_head_kwargs["temperature"] = float(pi_head_kwargs["temperature"])
    return policy_kwargs, pi_head_kwargs

def behavioural_cloning_train(data_dir, in_model, in_weights, out_weights):
    agent_policy_kwargs, agent_pi_head_kwargs = load_model_parameters(in_model)

    # To create model with the right environment.
    # All basalt environments have the same settings, so any of them works here
    env = gym.make("MineRLBasaltFindCave-v0")
    agent = MineRLAgent(env, device=DEVICE, policy_kwargs=agent_policy_kwargs, pi_head_kwargs=agent_pi_head_kwargs)
    agent.load_weights(in_weights)
    env.close()

    policy = agent.policy
    trainable_parameters = policy.parameters()

    # Parameters taken from the OpenAI VPT paper
    optimizer = th.optim.Adam(
        trainable_parameters,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    data_loader = DataLoader(
        dataset_dir=data_dir,
        n_workers=N_WORKERS,
        batch_size=BATCH_SIZE,
        n_epochs=EPOCHS
    )

    start_time = time.time()

    # Keep track of the hidden state per episode/trajectory.
    # DataLoader provides unique id for each episode, which will
    # be different even for the same trajectory when it is loaded
    # up again
    episode_hidden_states = {}
    dummy_first = th.from_numpy(np.array((False,))).to(DEVICE)

    loss_sum = 0
    for batch_i, (batch_images, batch_actions, batch_episode_id) in enumerate(data_loader):
        batch_loss = 0
        for image, action, episode_id in zip(batch_images, batch_actions, batch_episode_id):
            agent_action = agent._env_action_to_agent(action, to_torch=True, check_if_null=True)
            if agent_action is None:
                # Action was null
                continue

            agent_obs = agent._env_obs_to_agent({"pov": image})
            if episode_id not in episode_hidden_states:
                # TODO need to clean up this hidden state after worker is done with the work item.
                # Leaks memory, but not tooooo much at these scales (will be a problem later).
                episode_hidden_states[episode_id] = policy.initial_state(1)
            agent_state = episode_hidden_states[episode_id]

            pi_distribution, v_prediction, new_agent_state = policy.get_output_for_observation(
                agent_obs,
                agent_state,
                dummy_first
            )

            log_prob = policy.get_logprob_of_action(pi_distribution, agent_action)

            # Make sure we do not try to backprop through sequence
            # (fails with current accumulation)
            new_agent_state = tree_map(lambda x: x.detach(), new_agent_state)
            episode_hidden_states[episode_id] = new_agent_state

            # Finally, update the agent to increase the probability of the
            # taken action.
            # Remember to take mean over batch losses
            loss = -log_prob / BATCH_SIZE
            batch_loss += loss.item()
            loss.backward()

        th.nn.utils.clip_grad_norm_(trainable_parameters, MAX_GRAD_NORM)
        optimizer.step()
        optimizer.zero_grad()

        loss_sum += batch_loss
        if batch_i % LOSS_REPORT_RATE == 0:
            time_since_start = time.time() - start_time
            print(f"Time: {time_since_start:.2f}, Batches: {batch_i}, Avrg loss: {loss_sum / LOSS_REPORT_RATE:.4f}")
            loss_sum = 0

    state_dict = policy.state_dict()
    th.save(state_dict, out_weights)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--data-dir", type=str, required=True, help="Path to the directory containing recordings to be trained on")
    parser.add_argument("--in-model", required=True, type=str, help="Path to the .model file to be finetuned")
    parser.add_argument("--in-weights", required=True, type=str, help="Path to the .weights file to be finetuned")
    parser.add_argument("--out-weights", required=True, type=str, help="Path where finetuned weights will be saved")

    args = parser.parse_args()
    behavioural_cloning_train(args.data_dir, args.in_model, args.in_weights, args.out_weights)
Binary file added cursors/mouse_cursor_white_16x16.png