"""
This is the machinnery that runs your agent in an environment.
"""
import matplotlib.pyplot as plt
import numpy as np
import agent
class Runner:
    def __init__(self, environment, agent, verbose=False):
        self.environment = environment
        self.agent = agent
        self.verbose = verbose

    def step(self):
        # Copy the observation and the action so that neither the agent nor
        # the environment can mutate the other's state through a shared
        # reference.
        observation = self.environment.observe().clone()
        action = self.agent.act(observation).copy()
        (reward, done) = self.environment.act(action)
        self.agent.reward(observation, action, reward, done)
        return (observation, action, reward, done)
    def loop(self, games, nbr_epoch, max_iter):
        cumul_reward = 0.0
        list_cumul_reward = []
        list_optimal_ratio = []
        list_approx_ratio = []
        for epoch_ in range(nbr_epoch):
            print(" -> epoch : " + str(epoch_))
            for g in range(1, games + 1):
                print(" -> games : " + str(g))
                # replay each game 5 times so the agent sees it more than once
                for _ in range(5):
                    self.environment.reset(g)
                    self.agent.reset(g)
                    cumul_reward = 0.0
                    for i in range(1, max_iter + 1):
                        (obs, act, rew, done) = self.step()
                        cumul_reward += rew
                        if self.verbose:
                            print("Simulation step {}:".format(i))
                            print(" -> observation: {}".format(obs))
                            print(" -> action: {}".format(act))
                            print(" -> reward: {}".format(rew))
                            print(" -> cumulative reward: {}".format(cumul_reward))
                        if done:
                            # solution from the baseline (approximation) algorithm
                            approx_sol = self.environment.get_approx()
                            # optimal solution
                            optimal_sol = self.environment.get_optimal_sol()
                            # the cumulative reward of one play is the solution
                            # found by the NN algorithm
                            print(" -> Terminal event: cumulative rewards = {}".format(cumul_reward))
                            print(" -> Optimal solution = {}".format(optimal_sol))
                            # record the solution found by the NN algorithm
                            list_cumul_reward.append(-cumul_reward)
                            # record the ratio of the NN solution to the optimal solution
                            list_optimal_ratio.append(cumul_reward / optimal_sol)
                            # record the ratio of the NN solution to the baseline solution
                            list_approx_ratio.append(cumul_reward / approx_sol)
                            break
                np.savetxt('test_' + str(epoch_) + '.out', list_optimal_ratio, delimiter=',')
                np.savetxt('test_approx_' + str(epoch_) + '.out', list_approx_ratio, delimiter=',')
                if self.verbose:
                    print(" <=> Finished game number: {} <=>".format(g))
                    print("")
        np.savetxt('test.out', list_cumul_reward, delimiter=',')
        np.savetxt('opt_set.out', list_optimal_ratio, delimiter=',')
        # plt.plot(list_cumul_reward)
        # plt.show()
        return cumul_reward
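
# A minimal usage sketch (not part of the original file): the class names
# below are hypothetical stand-ins. Any environment exposing the
# observe/act/reset/get_approx/get_optimal_sol methods used above, and any
# agent exposing act/reward/reset, would work. Note that observe() must
# return an object with a .clone() method (e.g. a torch tensor) and the
# agent's act() an object with a .copy() method (e.g. a numpy array).
#
#     env = MyEnvironment()                 # hypothetical environment class
#     learner = MyAgent()                   # hypothetical agent class
#     runner = Runner(env, learner, verbose=False)
#     last_reward = runner.loop(games=10, nbr_epoch=100, max_iter=1000)
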
def iter_or_loopcall(o, count):
    """Build a list of `count` items from a factory, or exhaust an iterable."""
    if callable(o):
        return [o() for _ in range(count)]
    # otherwise `o` must be iterable; note that `count` is ignored here
    return list(o)
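
# Example (sketch) of the two accepted forms:
#
#     iter_or_loopcall(list, 3)        # -> [[], [], []], three fresh lists
#     iter_or_loopcall([1, 2, 3], 3)   # -> [1, 2, 3]; `count` is ignored
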
class BatchRunner:
    """
    Runs several instances of the same RL problem in parallel
    and aggregates the results.
    """
    def __init__(self, env_maker, agent_maker, count, verbose=False):
        self.environments = iter_or_loopcall(env_maker, count)
        self.agents = iter_or_loopcall(agent_maker, count)
        assert len(self.agents) == len(self.environments)
        self.verbose = verbose
        self.ended = [False for _ in self.environments]
    def game(self, max_iter):
        rewards = []
        for (agent, env) in zip(self.agents, self.environments):
            env.reset()
            agent.reset()
            game_reward = 0
            for i in range(1, max_iter + 1):
                observation = env.observe()
                action = agent.act(observation)
                (reward, stop) = env.act(action)
                agent.reward(observation, action, reward)
                game_reward += reward
                if stop:
                    break
            rewards.append(game_reward)
        # average reward across the parallel instances
        return sum(rewards) / len(rewards)
    def loop(self, games, nb_epoch, max_iter):
        cum_avg_reward = 0.0
        for epoch in range(nb_epoch):
            for g in range(1, games + 1):
                avg_reward = self.game(max_iter)
                cum_avg_reward += avg_reward
                if self.verbose:
                    print("Simulation game {}:".format(g))
                    print(" -> average reward: {}".format(avg_reward))
                    print(" -> cumulative average reward: {}".format(cum_avg_reward))
        return cum_avg_reward
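

if __name__ == "__main__":
    # Smoke test (a sketch, not part of the original project): exercises
    # BatchRunner's control flow with toy stand-in classes. The real
    # environments and agents are supplied elsewhere; these dummies only
    # implement the interface BatchRunner.game actually calls.
    import random

    class _ToyEnv:
        def __init__(self):
            self.steps = 0

        def reset(self):
            self.steps = 0

        def observe(self):
            return self.steps

        def act(self, action):
            self.steps += 1
            reward = random.random()
            done = self.steps >= 5  # stop each game after 5 steps
            return (reward, done)

    class _ToyAgent:
        def reset(self):
            pass

        def act(self, observation):
            return 0  # always take the same dummy action

        def reward(self, observation, action, reward):
            pass  # a real agent would learn from this transition

    # Factories are callables, so iter_or_loopcall builds 4 fresh instances.
    batch = BatchRunner(_ToyEnv, _ToyAgent, count=4, verbose=True)
    print("cumulative average reward:", batch.loop(games=2, nb_epoch=1, max_iter=10))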