
Commit

Merge pull request #17 from openai/backprop
Add backprop support + simple BC example
Miffyli authored Jul 26, 2022
2 parents f63a391 + b61623e commit 96b094e
Showing 7 changed files with 478 additions and 19 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -96,6 +96,27 @@ A window should pop up which shows the video frame-by-frame, showing the predict

Note that `run_inverse_dynamics_model.py` is designed to be a demo of the IDM, not code to put it into practice.

# Using behavioural cloning to fine-tune the models

**Disclaimer:** This code is a rough demonstration only, not an exact recreation of what the original VPT paper did (but it does contain some preprocessing steps you should be aware of)! As such, do not expect it to replicate the original experiments. This code has been designed to be runnable on consumer hardware (e.g., 8GB of VRAM).

Setup:
* Install requirements: `pip install -r requirements.txt`
* Download the `.weights` and `.model` files for the model you want to fine-tune.
* Download the contractor data (see below) and place the `.mp4` and `.jsonl` files in the same directory (e.g., `data`). With the default settings, you need at least 12 recordings (a quick layout check is sketched after this list).
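As a quick sanity check of the layout, the sketch below (not part of the repository) assumes each recording is an `.mp4`/`.jsonl` pair sharing a basename, which appears to be what `data_loader.py` expects, and lists recordings that are missing their action file:

```
# Hypothetical layout check, not part of the repository.
# Assumes each .mp4 recording has a matching .jsonl action file with the same basename.
from pathlib import Path

data_dir = Path("data")
mp4_files = sorted(data_dir.glob("*.mp4"))
missing = [p for p in mp4_files if not p.with_suffix(".jsonl").exists()]
print(f"Found {len(mp4_files)} recordings, {len(missing)} missing a .jsonl file")
for p in missing:
    print("  missing actions for", p.name)
```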

If you downloaded the "1x Width" models and placed some data under the `data` directory, you can perform fine-tuning with

```
python behavioural_cloning.py --data-dir data --in-model foundation-model-1x.model --in-weights foundation-model-1x.weights --out-weights finetuned-1x.weights
```

You can then use `finetuned-1x.weights` when running the agent (see the sketch below). You can change the training settings at the top of `behavioural_cloning.py`.
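For example, a rollout could look roughly like the following sketch, which mirrors how `behavioural_cloning.py` builds the agent (the environment name, step count, and rendering here are illustrative, not the repository's exact run script):

```
# Illustrative rollout sketch; mirrors the agent setup in behavioural_cloning.py.
import pickle
import gym
import minerl  # noqa: F401 -- registers the MineRL environments
from agent import MineRLAgent

# Load architecture hyperparameters stored in the .model file
agent_parameters = pickle.load(open("foundation-model-1x.model", "rb"))
policy_kwargs = agent_parameters["model"]["args"]["net"]["args"]
pi_head_kwargs = agent_parameters["model"]["args"]["pi_head_opts"]
pi_head_kwargs["temperature"] = float(pi_head_kwargs["temperature"])

env = gym.make("MineRLBasaltFindCave-v0")
agent = MineRLAgent(env, policy_kwargs=policy_kwargs, pi_head_kwargs=pi_head_kwargs)
agent.load_weights("finetuned-1x.weights")

obs = env.reset()
for _ in range(1000):
    action = agent.get_action(obs)
    obs, _, done, _ = env.step(action)
    env.render()
    if done:
        break
env.close()
```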

Major limitations:
- Only trains a single step at a time, i.e., errors are not propagated through timesteps.
- Computes gradients one sample at a time to keep memory use low, which also slows training down (see the sketch below).
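Concretely, the training loop in `behavioural_cloning.py` (shown in full later in this commit) follows the gradient-accumulation pattern sketched here; the linear model and random data are stand-ins for the VPT policy and the recordings:

```
# Schematic of per-sample gradient accumulation: backward() runs once per
# sample, the optimizer steps once per batch. Model and data are stand-ins.
import torch as th

model = th.nn.Linear(4, 2)
optimizer = th.optim.Adam(model.parameters(), lr=1e-4)
BATCH_SIZE = 8

batch = [th.randn(1, 4) for _ in range(BATCH_SIZE)]
for sample in batch:
    loss = model(sample).pow(2).mean() / BATCH_SIZE  # divide so gradients average over the batch
    loss.backward()                                  # gradients accumulate in .grad
th.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
optimizer.step()
optimizer.zero_grad()
```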

# Contractor Demonstrations

### Versions
44 changes: 35 additions & 9 deletions agent.py
@@ -139,7 +139,11 @@ def reset(self):
        self.hidden_state = self.policy.initial_state(1)

    def _env_obs_to_agent(self, minerl_obs):
-        """Turn observation from MineRL environment into model's observation"""
+        """
+        Turn observation from MineRL environment into model's observation
+        Returns torch tensors.
+        """
        agent_input = resize_image(minerl_obs["pov"], AGENT_RESOLUTION)[None]
        agent_input = {"img": th.from_numpy(agent_input).to(self.device)}
        return agent_input
@@ -149,17 +153,39 @@ def _agent_action_to_env(self, agent_action):
        # This is quite an important step (for some reason).
        # For the sake of your sanity, remember to do this step (manual conversion to numpy)
        # before proceeding. Otherwise, your agent might be a little derp.
-        action = {
-            "buttons": agent_action["buttons"].cpu().numpy(),
-            "camera": agent_action["camera"].cpu().numpy()
-        }
+        action = agent_action
+        if isinstance(action["buttons"], th.Tensor):
+            action = {
+                "buttons": agent_action["buttons"].cpu().numpy(),
+                "camera": agent_action["camera"].cpu().numpy()
+            }
        minerl_action = self.action_mapper.to_factored(action)
        minerl_action_transformed = self.action_transformer.policy2env(minerl_action)
        return minerl_action_transformed

-    def _env_action_to_agent(self, minerl_action):
-        """Turn action from MineRL to model's action"""
-        raise NotImplementedError()
+    def _env_action_to_agent(self, minerl_action_transformed, to_torch=False, check_if_null=False):
+        """
+        Turn an action from MineRL into the model's action.
+        Note that this will add batch dimensions to the action.
+        Returns numpy arrays, unless `to_torch` is True, in which case it returns torch tensors.
+        If `check_if_null` is True, check if the action is null (no action) after the initial
+        transformation. This matches the behaviour of OpenAI's VPT work.
+        If the action is null, return `None` instead.
+        """
+        minerl_action = self.action_transformer.env2policy(minerl_action_transformed)
+        if check_if_null:
+            if np.all(minerl_action["buttons"] == 0) and np.all(minerl_action["camera"] == self.action_transformer.camera_zero_bin):
+                return None
+
+        # Add batch dims if not present
+        if minerl_action["camera"].ndim == 1:
+            minerl_action = {k: v[None] for k, v in minerl_action.items()}
+        action = self.action_mapper.from_factored(minerl_action)
+        if to_torch:
+            action = {k: th.from_numpy(v).to(self.device) for k, v in action.items()}
+        return action

    def get_action(self, minerl_obs):
        """
@@ -177,4 +203,4 @@ def get_action(self, minerl_obs):
            stochastic=True
        )
        minerl_action = self._agent_action_to_env(agent_action)
        return minerl_action
143 changes: 143 additions & 0 deletions behavioural_cloning.py
@@ -0,0 +1,143 @@
# Basic behavioural cloning
# Note: this uses gradient accumulation with a batch size of one
# to perform training.
# This will fit even on smaller GPUs (tested on an 8GB one),
# but it is slow.
# NOTE: This is _not_ the original code used for VPT!
# This is merely to illustrate how to fine-tune the models and includes
# the processing steps used.

# This will likely be much worse than what the original VPT did:
# we are not training on full sequences, but only one step at a time to save VRAM.

from argparse import ArgumentParser
import pickle
import time

import gym
import minerl
import torch as th
import numpy as np

from agent import PI_HEAD_KWARGS, MineRLAgent
from data_loader import DataLoader
from lib.tree_util import tree_map

EPOCHS = 2
# Needs to be <= number of videos
BATCH_SIZE = 8
# Ideally more than the batch size to create
# variation in the batches (otherwise, you will
# get a bunch of consecutive samples).
# Decrease this (and BATCH_SIZE) if you run out of memory
N_WORKERS = 12
DEVICE = "cuda"

LOSS_REPORT_RATE = 100

LEARNING_RATE = 0.000181
WEIGHT_DECAY = 0.039428
MAX_GRAD_NORM = 5.0

def load_model_parameters(path_to_model_file):
    agent_parameters = pickle.load(open(path_to_model_file, "rb"))
    policy_kwargs = agent_parameters["model"]["args"]["net"]["args"]
    pi_head_kwargs = agent_parameters["model"]["args"]["pi_head_opts"]
    pi_head_kwargs["temperature"] = float(pi_head_kwargs["temperature"])
    return policy_kwargs, pi_head_kwargs

def behavioural_cloning_train(data_dir, in_model, in_weights, out_weights):
    agent_policy_kwargs, agent_pi_head_kwargs = load_model_parameters(in_model)

    # To create model with the right environment.
    # All basalt environments have the same settings, so any of them works here
    env = gym.make("MineRLBasaltFindCave-v0")
    agent = MineRLAgent(env, device=DEVICE, policy_kwargs=agent_policy_kwargs, pi_head_kwargs=agent_pi_head_kwargs)
    agent.load_weights(in_weights)
    env.close()

    policy = agent.policy
    trainable_parameters = policy.parameters()

    # Parameters taken from the OpenAI VPT paper
    optimizer = th.optim.Adam(
        trainable_parameters,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    data_loader = DataLoader(
        dataset_dir=data_dir,
        n_workers=N_WORKERS,
        batch_size=BATCH_SIZE,
        n_epochs=EPOCHS
    )

    start_time = time.time()

    # Keep track of the hidden state per episode/trajectory.
    # DataLoader provides unique id for each episode, which will
    # be different even for the same trajectory when it is loaded
    # up again
    episode_hidden_states = {}
    dummy_first = th.from_numpy(np.array((False,))).to(DEVICE)

    loss_sum = 0
    for batch_i, (batch_images, batch_actions, batch_episode_id) in enumerate(data_loader):
        batch_loss = 0
        for image, action, episode_id in zip(batch_images, batch_actions, batch_episode_id):
            agent_action = agent._env_action_to_agent(action, to_torch=True, check_if_null=True)
            if agent_action is None:
                # Action was null
                continue

            agent_obs = agent._env_obs_to_agent({"pov": image})
            if episode_id not in episode_hidden_states:
                # TODO need to clean up this hidden state after worker is done with the work item.
                # Leaks memory, but not tooooo much at these scales (will be a problem later).
                episode_hidden_states[episode_id] = policy.initial_state(1)
            agent_state = episode_hidden_states[episode_id]

            pi_distribution, v_prediction, new_agent_state = policy.get_output_for_observation(
                agent_obs,
                agent_state,
                dummy_first
            )

            log_prob = policy.get_logprob_of_action(pi_distribution, agent_action)

            # Make sure we do not try to backprop through sequence
            # (fails with current accumulation)
            new_agent_state = tree_map(lambda x: x.detach(), new_agent_state)
            episode_hidden_states[episode_id] = new_agent_state

            # Finally, update the agent to increase the probability of the
            # taken action.
            # Remember to take mean over batch losses
            loss = -log_prob / BATCH_SIZE
            batch_loss += loss.item()
            loss.backward()

        th.nn.utils.clip_grad_norm_(trainable_parameters, MAX_GRAD_NORM)
        optimizer.step()
        optimizer.zero_grad()

        loss_sum += batch_loss
        if batch_i % LOSS_REPORT_RATE == 0:
            time_since_start = time.time() - start_time
            print(f"Time: {time_since_start:.2f}, Batches: {batch_i}, Avrg loss: {loss_sum / LOSS_REPORT_RATE:.4f}")
            loss_sum = 0

    state_dict = policy.state_dict()
    th.save(state_dict, out_weights)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--data-dir", type=str, required=True, help="Path to the directory containing recordings to be trained on")
    parser.add_argument("--in-model", required=True, type=str, help="Path to the .model file to be finetuned")
    parser.add_argument("--in-weights", required=True, type=str, help="Path to the .weights file to be finetuned")
    parser.add_argument("--out-weights", required=True, type=str, help="Path where finetuned weights will be saved")

    args = parser.parse_args()
    behavioural_cloning_train(args.data_dir, args.in_model, args.in_weights, args.out_weights)
Binary file added cursors/mouse_cursor_white_16x16.png