-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrl-q-rbf-cartpole.py
270 lines (227 loc) · 11.9 KB
/
rl-q-rbf-cartpole.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# Solves the CartPole problem using Q-Learning and a RBF Network
#
# The meta parameters below are chosen, so that on average, the system needs
# 220 episodes to learn to balance "endlessly", i.e. 5,000 cycles. If this is not working
# at the first time, this might be due to the stochastic nature of the whole thing: Just run again.
#
# uses http://gym.openai.com/envs/CartPole-v1
# inspired by Lecture 17 of Udemy/Lazy Programmer Inc/Advanced AI: Deep Reinforcement Learning
#
# done by sy2002 on 11th of August 2018 and tuned on 17th of April 2019
# added the Disturb-Mode on 20th of July 2019
import gym
import gym.spaces
import numpy as np
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
# Basic idea of this "RBF Network" solution:
#
# * CartPole is offering 4 features: Cart Position, Cart Velocity, Pole Angle and Pole Velocity At Tip.
# * When we use a collection of (aka "network") of Radial Basis Functions (RBFs), then we can transform
# these 4 features into n distances from the centers of the RBFs, where n = RBF_EXEMPLARS x RBF_GAMMA_COUNT
# * The "Gamma" parameter controls the "width" of the RBF's bell curve. The idea is, to use multiple RBFSamplers
# with multiple widths to construct a network with a good variety to sample from.
# * The RBF transforms the initial 4 features into plenty of features and therefore offers a lot of "variables"
# or something like "resolution", where a Linear Regression algorithm can calcluate the weights for.
# In contrast, the original four features of the observation space would yield a too "low resolution".
# * The Reinforcement Learning algorithm used to learn to balance the pole is Q-Learning.
# * The "State" "s" of the Cart is obtained by using the above-mentioned RBF Network to transform the four
# original features into the n distances from a randomly chosen amount of RBF centers (aka "Exemplars").
# Note that it is absolutely OK, that the Exemplars are chosen randomly, because each Exemplar defines one
# possible combination of (Cart Position, Cart Velocity, Pole Angle and Pole Velocity At Tip); and therefore
# having lots of those random Exemplars just gives us the granularity ("resolution") we need for our Linear Regression.
# * The possible "Actions" "a" of the system are "push from left" or "push from right", aka 0 and 1.
# * For each possible action we are calculating one Value Function using Linear Regression. It can be illustrated by
# asking the question: "For the state we are currently in, defined by the distances to the Exemplars, what is the
# Value Function for pushing from left or puhsing from right. The larger one wins."
# On RBFSampler:
#
# Despite its name, the RBFSampler is actually not using any RBFs inside. I did some experiments about
# this fact. Go to https://github.com/sy2002/ai-playground/blob/master/RBFSampler-Experiment.ipynb
# to learn more.
RBF_EXEMPLARS = 250 # amount of exemplars per "gamma instance" of the RBF network
RBF_GAMMA_COUNT = 10 # amount of "gamma instances", i.e. RBF_EXEMPLARS x RBF_GAMMA_COUNT features
RBF_GAMMA_MIN = 0.05 # minimum gamma, linear interpolation between min and max
RBF_GAMMA_MAX = 4.0 # maximum gamma
RBF_SAMPLING = 30000 # amount of samples to take for sampling the observation space for fitting the Scaler
LEARN_EPISODES = 220 # number of episodes to learn
TEST_EPISODES = 10 # number of episodes that we use the visual rendering to test what we learned
GAMMA = 0.999 # discount factor for Q-Learning
ALPHA = 0.75 # initial learning rate
ALPHA_DECAY = 0.10 # learning rate decay
EPSILON = 0.5 # randomness for epsilon-greedy algorithm (explore vs exploit)
EPSILON_DECAY_t = 0.1
EPSILON_DECAY_m = 4 # every episode % EPSILON_DECAY_m == 0, we increase EPSILON_DECAY_t
PROBE = 20 # after how many episodes we will print the status
print("\nCartPole-v1 solver by sy2002 on 20th of July 2019")
print("=================================================\n")
print("Sampling observation space by playing", RBF_SAMPLING, "random episodes...")
# create environment
env = gym.make('CartPole-v1')
env_actions = [0, 1] # needs to be in ascending order with no gaps, e.g. [0, 1]
# CartPole has an extremely large observation space, as Gym's observation_space high/low
# attributes are basically returning the whole floating point range as possible values
# for some of the parameters.
# So let's sample possible values by running an amount of experiments.
# Caveat: This is only sampling a small amount of possible values, because running
# the CartPole randomly does not explore too much of the space.
# Nevertheless: We need this only for the scaler, as RBFSampler's fit function does
# not care about the data you pass to it.
observation_samples = []
for i in range(RBF_SAMPLING):
env.reset()
done = False
while not done:
observation, _, done, _ = env.step(env.action_space.sample())
observation_samples.append(observation)
print("Done. Amount of observation samples: ", len(observation_samples))
# create scaler and fit it to the sampled observation space
scaler = StandardScaler()
scaler.fit(observation_samples)
# the RBF network is built like this: create as many RBFSamplers as RBF_GAMMA_COUNT
# and do so by setting the "width" parameter GAMMA of the RBFs as a linear interpolation
# between RBF_GAMMA_MIN and RBF_GAMMA_MAX
gammas = np.linspace(RBF_GAMMA_MIN, RBF_GAMMA_MAX, RBF_GAMMA_COUNT)
models = [RBFSampler(n_components=RBF_EXEMPLARS, gamma=g) for g in gammas]
# we will put all these RBFSamplers into a FeatureUnion, so that our Linear Regression
# can regard them as one single feature space spanning over all "Gammas"
transformer_list = []
for model in models:
model.fit([observation_samples[0]]) # RBFSampler just needs the dimensionality, not the data itself
transformer_list.append((str(model), model))
rbfs = FeatureUnion(transformer_list) #union of all RBF exemplar's output
# we use one Linear Regression per possible action to model the Value Function for this action,
# so rbf_net is a list; SGDRegressor allows step-by-step regression using partial_fit, which
# is exactly what we need to learn
rbf_net = [SGDRegressor(eta0=ALPHA, power_t=ALPHA_DECAY, learning_rate='invscaling', max_iter=5, tol=float("-inf"))
for i in range(len(env_actions))]
# transform the 4 features (Cart Position, Cart Velocity, Pole Angle and Pole Velocity At Tip)
# into RBF_EXEMPLARS x RBF_GAMMA_COUNT distances from the RBF centers ("Exemplars")
def transform_s(s):
return rbfs.transform(scaler.transform(np.array(s).reshape(1, -1)))
# SGDRegressor expects a vector, so we need to transform our action, which is 0 or 1 into a vector
def transform_val(val):
return np.array([val]).ravel()
# SGDRegressor needs "partial_fit" to be called before the first "predict" is called
# so let's do that for all actions (aka entries in the rbf_net list) with random (dummy) values
# for the state and the Value Function
for net in rbf_net:
random_val_func = np.random.rand() * np.random.randint(1, 100) # random float 0 < x < 100
net.partial_fit(transform_s(env.reset()), transform_val(random_val_func))
# get the result of the Value Function for action a taken in state s
def get_Q_s_a(s, a):
return rbf_net[a].predict(transform_s(s))
# learn Value Function for action a in state s
def set_Q_s_a(s, a, val):
rbf_net[a].partial_fit(transform_s(s), transform_val(val))
# find the best action to take in state s
def max_Q_s(s):
max_val = float("-inf")
max_idx = -1
for a in env_actions:
val = get_Q_s_a(s, a)
if val > max_val:
max_val = val
max_idx = a
return max_idx, max_val
t = 1.0 # used to decay epsilon
won_last_interval = 0 # for statistical purposes
steps_last_interval = 0 # dito
print("\nLearn:")
print("======\n")
print("Episode\t\t\tAvg. Steps\t\tepsilon")
for episode in range(LEARN_EPISODES + 1):
# let epsilon decay over time
if episode % EPSILON_DECAY_m == 0:
t += EPSILON_DECAY_t
# each episode starts again at the begining
s = env.reset()
a = np.random.choice(env_actions)
episode_step_count = 0
done = False
while not done:
episode_step_count += 1
steps_last_interval += 1
# epsilon-greedy: do a random move with the decayed EPSILON probability
p = np.random.random()
eps = EPSILON / t
if p > (1 - eps):
a = np.random.choice(env_actions)
# exploit or explore and collect reward
observation, r, done, _ = env.step(a)
s2 = observation
# Q-Learning
old_qsa = get_Q_s_a(s, a)
# if this is not the terminal position (which has no actions), then we can proceed as normal
if not done:
# "Q-greedily" grab the gain of the best step forward from here
a2, max_q_s2a2 = max_Q_s(s2)
# else change the rewards in the terminal position to have a clearer bias for "longevity"
else:
if episode_step_count < 100:
r = -100
elif episode_step_count < 200:
r = 100
elif episode_step_count < 300:
r = 150
elif episode_step_count < 400:
r = 200
elif episode_step_count <= 500:
r = 300
else:
r = 1000
a2 = a
max_q_s2a2 = 0 # G (future rewards) of terminal position is 0 although the reward r is high
# learn the new value for the Value Function
new_value = old_qsa + (r + GAMMA * max_q_s2a2 - old_qsa)
set_Q_s_a(s, a, new_value)
# next state = current state, next action is the "Q-greedy" best action
s = s2
a = a2
# every PROBEth episode: print status info
if episode % PROBE == 0:
if episode == 0:
probe_divisor = 1
else:
probe_divisor = PROBE
print("%d\t\t\t%0.2f\t\t\t%0.4f" % (episode, steps_last_interval / probe_divisor, eps))
won_last_interval = 0
steps_last_interval = 0
# now, after we learned how to balance the pole: test it and use the visual output of Gym
print("\nTest:")
print("=====\n")
# use DISTURB_PROB to switch a probabilistic system disturbance:
# if you set it to a value larger then 0.0, the system will be disturbed
# with this probability for an amount of steps given by DISTURB_DURATION
DISTURB_PROB = 0.00
DISTURB_DURATION = 5 # needs to be at least 1; suggestion: try 5 in combination with 0.01 probability
if DISTURB_PROB > 0.0:
print("Disturb-Mode ON! Probability = %0.4f Duration = %d\n" % (DISTURB_PROB, DISTURB_DURATION))
print("Episode\tSteps\tResult")
all_steps = 0
for episode in range(TEST_EPISODES):
observation = env.reset()
env.render()
done = False
won = False
episode_step_count = 0
dist_ongoing = 0
# we are ignoring Gym's "done" function, so that we can run the system longer
while observation[0] > -2.4 and observation[0] < 2.4 and episode_step_count < 5000:
episode_step_count += 1
all_steps += 1
a, _ = max_Q_s(observation)
if DISTURB_PROB > 0.0:
if dist_ongoing == 0 and np.random.rand() > (1.0 - DISTURB_PROB):
dist_ongoing = DISTURB_DURATION
if dist_ongoing > 0:
print("\tstep #%d:\tdisturbance ongoing: %d" % (episode_step_count, dist_ongoing))
dist_ongoing -= 1
a = np.random.choice(env_actions)
observation, _, done, _ = env.step(a)
env.render()
print("%d\t\t%d\t\t" % (episode, episode_step_count))
print("Avg. Steps: %0.2f" % (all_steps / TEST_EPISODES))
env.close()