diff --git a/mjrl/utils/gym_env.py b/mjrl/utils/gym_env.py
index 9061b05..eac628c 100644
--- a/mjrl/utils/gym_env.py
+++ b/mjrl/utils/gym_env.py
@@ -1,5 +1,6 @@
 import gym
 import numpy as np
+from tqdm import tqdm
 
 class EnvSpec(object):
     def __init__(self, obs_dim, act_dim, horizon, num_agents):
@@ -79,6 +80,7 @@ def evaluate_policy(self, policy,
         if seed is not None:
             self.env._seed(seed)
         horizon = self._horizon if horizon is None else horizon
+        mean_eval, std, min_eval, max_eval = 0.0, 0.0, -1e8, -1e8
 
         ep_returns = np.zeros(num_episodes)
 
@@ -123,3 +125,77 @@ def evaluate_policy(self, policy,
             full_dist = ep_returns
 
         return [base_stats, percentile_stats, full_dist]
+
+    def evaluate_policy_vil(self, policy,
+                            use_tactile,
+                            num_episodes=5,
+                            horizon=None,
+                            gamma=1,
+                            percentile=[],
+                            get_full_dist=False,
+                            mean_action=False,
+                            terminate_at_done=True,
+                            seed=None,
+                            camera_name=None,
+                            device_id=0,
+                            use_cuda=False,
+                            frame_size=(128, 128)):
+
+        if seed is not None:
+            self.env.env._seed(seed)
+            np.random.seed(seed)
+
+        horizon = self._horizon if horizon is None else horizon
+        mean_eval, std, min_eval, max_eval = 0.0, 0.0, -1e8, -1e8
+        ep_returns = np.zeros(num_episodes)
+
+        for ep in tqdm(range(num_episodes)):
+
+            self.reset()
+            robot_info = self.env.env.get_proprioception(use_tactile=use_tactile)
+
+            path_image_pixels = []
+            t, done = 0, False
+            while t < horizon and not (done and terminate_at_done):
+                image_pix = self.env.env.get_pixels(frame_size=frame_size, camera_name=camera_name, device_id=device_id)
+                img = image_pix
+                prev_img = image_pix
+                prev_prev_img = image_pix
+                if t > 0:
+                    prev_img = path_image_pixels[t - 1]
+
+                if t > 1:
+                    prev_prev_img = path_image_pixels[t - 2]
+                path_image_pixels.append(img)
+                prev_prev_img = np.expand_dims(prev_prev_img, axis=0)
+                prev_img = np.expand_dims(prev_img, axis=0)
+                img = np.expand_dims(img, axis=0)
+
+                o = np.concatenate((prev_prev_img, prev_img, img), axis=0)
+
+                if mean_action:
+                    a = policy.get_action(o, robot_info=robot_info, use_cuda=use_cuda)[1]['mean']
+                else:
+                    a = policy.get_action(o, robot_info=robot_info, use_cuda=use_cuda)[0]
+
+                o, r, done, _ = self.step(a)
+
+                robot_info = self.env.env.get_proprioception(use_tactile=use_tactile)
+
+                ep_returns[ep] += (gamma ** t) * r
+                t += 1
+
+        mean_eval, std = np.mean(ep_returns), np.std(ep_returns)
+        min_eval, max_eval = np.amin(ep_returns), np.amax(ep_returns)
+        base_stats = [mean_eval, std, min_eval, max_eval]
+
+        percentile_stats = []
+        full_dist = []
+
+        for p in percentile:
+            percentile_stats.append(np.percentile(ep_returns, p))
+
+        if get_full_dist == True:
+            full_dist = ep_returns
+
+        return [base_stats, percentile_stats, full_dist]
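
For context, a minimal usage sketch of the new evaluate_policy_vil method (not part of the diff). It assumes GymEnv is constructed from a registered gym environment id whose underlying env implements get_proprioception() and get_pixels() as the method requires, and that policy is a visuomotor policy whose get_action accepts the stacked three-frame observation plus robot_info and use_cuda keyword arguments; the environment id, checkpoint path, and camera name below are hypothetical.

# Hypothetical usage sketch -- not part of the diff. Assumes an environment
# that implements get_proprioception()/get_pixels() and a pickled visuomotor
# policy compatible with evaluate_policy_vil.
import pickle
from mjrl.utils.gym_env import GymEnv

env = GymEnv('mjrl_door_vil-v0')                    # hypothetical env id
with open('visuomotor_policy.pickle', 'rb') as f:   # hypothetical checkpoint
    policy = pickle.load(f)

base_stats, percentile_stats, full_dist = env.evaluate_policy_vil(
    policy,
    use_tactile=False,         # proprioception without tactile sensing
    num_episodes=10,
    mean_action=True,          # evaluate with the mean of the action distribution
    seed=123,
    camera_name='vil_camera',  # hypothetical camera defined in the model XML
    frame_size=(128, 128),
    get_full_dist=True)

print('mean return: %.2f +/- %.2f' % (base_stats[0], base_stats[1]))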