train.py
""" Trains a model depending on """
import random
import sys
import tensorflow as tf
from helper import preprocess_image
from helper import save_figs
from setup import initialize_replay_memory
import numpy as np
import datetime
import time
#consider using global variables for this!!!
def train(env, parameter, saver, forward_dict, loss_dict):
    """ Trains the network on the given environment and saves the weights via saver.
    In: env (OpenAI gym environment)
    In: parameter (dictionary with settings)
    In: saver (tf.train.Saver that weights and biases are saved to)
    In: forward_dict (tensors of the forward pass: 'input', 'Qout', 'predict')
    In: loss_dict (tensors of the training op: 'nextQ', 'updateModel')
    Out: reward_list (total reward per episode)
    Out: steps_list (number of steps 'survived' in each episode)
    (A hedged usage sketch follows directly below this function.)
    """
    reward_list = []
    steps_list = []
    if parameter['OLD_TF']:
        init = tf.initialize_all_variables()  # deprecated initializer for old TF versions
    else:
        init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in xrange(parameter['NUM_EPISODES']):
            start_time = datetime.datetime.now()
            total_reward, steps = run_episode(
                env=env,
                cur_episode=i,
                sess=sess,
                parameter=parameter,
                forward_dict=forward_dict,
                loss_dict=loss_dict
            )
            end_time = datetime.datetime.now()
            reward_list.append(total_reward)
            steps_list.append(steps)
            percentage = float(i) / parameter['NUM_EPISODES']
            total_time = end_time - start_time
            if i % parameter['SAVE_EVERY'] == 0:
                save_figs(reward_list, steps_list, parameter, i)
                saver.save(sess, 'modelSpaceInvader.ckpt', global_step=i)
                print "Progress: {0:.3f}%".format(percentage * 100)
                print "Est. time per episode: " + str(total_time)
                print "Episodes left: {0:d}".format(parameter['NUM_EPISODES'] - i)
                print "Mean reward per episode so far: " + str(float(sum(reward_list)) / (i + 1))
                print
        save_figs(reward_list, steps_list, parameter, parameter['NUM_EPISODES'] + 1)
        saver.save(sess, 'modelSpaceInvader.ckpt', global_step=parameter['NUM_EPISODES'] + 1)
    return reward_list, steps_list
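

# ---------------------------------------------------------------------------
# Hedged usage sketch: how train() is typically wired up. This is not part of
# the original training loop; gym.make('SpaceInvaders-v0'), build_network() and
# the concrete parameter values are illustrative assumptions, while the keys
# themselves (NUM_EPISODES, NUM_STEPS, SAVE_EVERY, EPS, GAMMA, OLD_TF) are the
# ones actually read by the functions in this file.
#
#   import gym
#   env = gym.make('SpaceInvaders-v0')
#   forward_dict, loss_dict = build_network()  # hypothetical graph builder
#   parameter = {'NUM_EPISODES': 1000, 'NUM_STEPS': 10000, 'SAVE_EVERY': 50,
#                'EPS': 0.5, 'GAMMA': 0.99, 'OLD_TF': False}
#   saver = tf.train.Saver()
#   reward_list, steps_list = train(env, parameter, saver, forward_dict, loss_dict)
# ---------------------------------------------------------------------------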


def run_episode(env, sess, cur_episode, parameter, forward_dict, loss_dict):
    """ Runs one episode of the environment.
    In: env
    In: sess (tf.Session the graph runs in)
    In: cur_episode (index of the current episode, used to decay epsilon)
    In: parameter
    Out: total_reward (accumulated over the episode)
    Out: steps (steps needed until termination)
    """
    total_reward = 0
    done = False
    steps = 1
    # Create the initial 4 experiences so the replay memory is never empty
    replay_memory, observation = initialize_replay_memory(4, env)
    # Actual interaction loop
    while steps < parameter['NUM_STEPS']:
        steps += 1
        new_observation, reward, done, p_observation, action, p_new_observation = step_environment(
            env=env,
            observation=observation,
            sess=sess,
            eps=parameter['EPS'] / (cur_episode + 1) + 0.06,  # simple decay; see the schedule sketch below this function
            gamma=parameter['GAMMA'],
            forward_dict=forward_dict,
            loss_dict=loss_dict
        )
        total_reward += reward
        aptuple = (p_observation, action, reward, p_new_observation)
        replay_memory.append(aptuple)
        # Replay 4 randomly chosen experiences per step.
        # Open question: how often should we replay, and is this separate,
        # unbatched update the right way to do so? (See the batched sketch at
        # the end of the file.)
        for _ in xrange(4):
            replay(env=env,
                   aptuple=random.choice(replay_memory),
                   sess=sess,
                   gamma=parameter['GAMMA'],
                   forward_dict=forward_dict,
                   loss_dict=loss_dict
                   )
        if done:
            break
        observation = new_observation
    return total_reward, steps
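

# A minimal sketch of the 'lambda dictionary' idea mentioned for the epsilon
# decay in run_episode: the schedule becomes an entry of the parameter
# dictionary instead of an inline formula. The key name 'EPS_SCHEDULE' is an
# assumption; the decay shape and the 0.06 floor mirror the hard-coded values
# used above.
def make_eps_schedule(start_eps, floor=0.06):
    """ Returns a function mapping an episode index to an exploration rate. """
    return lambda episode: start_eps / (episode + 1) + floor

# Usage sketch: set parameter['EPS_SCHEDULE'] = make_eps_schedule(parameter['EPS'])
# once, then pass eps=parameter['EPS_SCHEDULE'](cur_episode) into step_environment.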


def step_environment(env, observation, sess, eps, gamma, forward_dict, loss_dict):
    """ Takes one step within the environment and applies a Q-learning update.
    In: env (OpenAI gym environment)
    In: observation (current state of the game)
    In: sess (tf.Session the graph runs in)
    In: eps (probability of taking a random exploratory action)
    In: gamma (discount factor)
    Out: new_observation
    Out: reward
    Out: done
    Out: p_observation, action, p_new_observation (preprocessed states and chosen action, stored for replay)
    """
    # Policy forward pass; takes about 70% of the running time, which is fine
    # because this forward pass is the heart of the computation
    p_observation = preprocess_image(observation)
    action, all_Qs = sess.run([forward_dict['predict'], forward_dict['Qout']],
                              feed_dict={forward_dict['input']: p_observation})
    # Epsilon-greedy exploration
    if np.random.rand(1) < eps:
        action[0] = env.action_space.sample()
    new_observation, reward, done, _ = env.step(action[0])
    # Forward pass on the successor state to get the bootstrap value
    p_new_observation = preprocess_image(new_observation)
    Q_next = sess.run(forward_dict['Qout'], feed_dict={forward_dict['input']: p_new_observation})
    maxQ_next = np.max(Q_next)
    # Q-learning target: only the entry of the taken action is changed
    targetQ = all_Qs
    targetQ[0, action[0]] = reward + gamma * maxQ_next
    # Gradient step towards the target; feed the current (pre-step) state,
    # since that is the state whose Q-values the target was built from
    sess.run(loss_dict['updateModel'],
             feed_dict={forward_dict['input']: p_observation, loss_dict['nextQ']: targetQ})
    return new_observation, reward, done, p_observation, action, p_new_observation


def replay(env, aptuple, sess, gamma, forward_dict, loss_dict):
    """ Re-trains the network on a single stored experience tuple. """
    # Unpack the stored experience
    p_observation, action, reward, p_new_observation = aptuple
    # Q-values of the stored state; the stored action is kept so the target
    # matches the transition that actually produced this reward
    all_Qs = sess.run(forward_dict['Qout'], feed_dict={forward_dict['input']: p_observation})
    # Q-values of the successor state, used for the bootstrap value
    Q_next = sess.run(forward_dict['Qout'], feed_dict={forward_dict['input']: p_new_observation})
    # Build the Q-learning target and take one gradient step on the stored state
    maxQ_next = np.max(Q_next)
    targetQ = all_Qs
    targetQ[0, action[0]] = reward + gamma * maxQ_next
    sess.run(loss_dict['updateModel'],
             feed_dict={forward_dict['input']: p_observation, loss_dict['nextQ']: targetQ})
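

# A minimal sketch of a batched alternative to the per-sample replay() calls in
# run_episode, addressing the open question noted there. It assumes that the
# preprocessed observations stack along the first axis and that
# forward_dict['input'] / loss_dict['nextQ'] accept a batch dimension; both are
# assumptions about the graph built elsewhere, not guaranteed by this file.
def replay_batch(replay_memory, sess, gamma, forward_dict, loss_dict, batch_size=32):
    """ Samples a batch of stored experiences and performs one batched Q-learning update. """
    batch = random.sample(replay_memory, min(batch_size, len(replay_memory)))
    states = np.concatenate([b[0] for b in batch], axis=0)
    next_states = np.concatenate([b[3] for b in batch], axis=0)
    # Q-values for the stored states and their successors
    all_Qs = sess.run(forward_dict['Qout'], feed_dict={forward_dict['input']: states})
    Q_next = sess.run(forward_dict['Qout'], feed_dict={forward_dict['input']: next_states})
    # Overwrite only the entries of the taken actions with the bootstrapped targets
    targetQ = all_Qs
    for row, (_, action, reward, _) in enumerate(batch):
        targetQ[row, action[0]] = reward + gamma * np.max(Q_next[row])
    sess.run(loss_dict['updateModel'],
             feed_dict={forward_dict['input']: states, loss_dict['nextQ']: targetQ})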