"""This is a simple example demonstrating how to clone the behavior of an expert.
Refer to the jupyter notebooks for more detailed examples of how to use the algorithms.
"""
import torch
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
# imitation-library components for behaviour cloning and reward learning
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
import gymnasium as gym
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper
from reward_function import RwdFromRwdNet
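# A minimal behaviour-cloning sketch using the imports above, roughly following the
# imitation library's BC quickstart. The "expert" here is only a briefly trained PPO
# policy, and every hyperparameter below is a placeholder rather than a tuned value;
# load_policy could be used instead to fetch a genuinely pretrained expert.
bc_rng = np.random.default_rng(0)
bc_env = make_vec_env(
    "CartPole-v1",
    n_envs=4,
    rng=bc_rng,
    post_wrappers=[lambda e, _: RolloutInfoWrapper(e)],  # record full episodes for rollout collection
)

# Stand-in expert: a PPO agent trained for far too few steps to be a real expert.
expert = PPO(MlpPolicy, bc_env, verbose=0)
expert.learn(total_timesteps=2_000)

# Collect demonstrations from the expert and flatten them into transitions.
rollouts = rollout.rollout(
    expert,
    bc_env,
    rollout.make_sample_until(min_timesteps=None, min_episodes=10),
    rng=bc_rng,
)
transitions = rollout.flatten_trajectories(rollouts)

# Fit a policy to the expert transitions with behaviour cloning.
bc_trainer = bc.BC(
    observation_space=bc_env.observation_space,
    action_space=bc_env.action_space,
    demonstrations=transitions,
    rng=bc_rng,
)
bc_trainer.train(n_epochs=1)

# Evaluate the cloned policy.
mean_reward, _ = evaluate_policy(bc_trainer.policy, bc_env, n_eval_episodes=5)
print(f"BC policy mean reward: {mean_reward}")
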
# Disabled scratch block: builds a learned-reward environment by wrapping the vec env's
# reward with a BasicRewardNet routed through RwdFromRwdNet.
'''
rng = np.random.default_rng(0)
env = make_vec_env(
    "CartPole-v1",
    n_envs=4,
    rng=rng,
    # post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # for computing rollouts
)
_ = env.reset()

# Reward network over the environment's observation and action spaces.
rwd_net = BasicRewardNet(env.observation_space, env.action_space)
rwd_fn = RwdFromRwdNet(rwd_net=rwd_net)

# Replace the environment reward with the reward predicted by rwd_net.
wenv = RewardVecEnvWrapper(
    venv=env,
    reward_fn=rwd_fn,
)

starting_action = np.array([0], dtype=np.int64)  # np.integer is abstract; use a concrete dtype
# actions = np.repeat(starting_action, repeats=[4], axis=0)
# print(wenv.step(actions))

loss = torch.zeros((1, 1))
print(loss)
'''
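# If the block above were enabled, `wenv` would behave like the original vec env but
# return rewards computed by `rwd_net` instead of the environment reward; a typical
# next step would be to train a policy against the learned reward, e.g.
# PPO(MlpPolicy, wenv).learn(...). This assumes RwdFromRwdNet exposes rwd_net as a
# callable matching imitation's RewardFn interface.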
# Quick sanity check that the script runs end to end: prints 0, 3, 6, 9.
for i in range(0, 10, 3):
    print(i)