cartpole_static.py
import gymnasium as gym
import numpy as np
from tqdm import trange
import helper as help
""" defining file controls and initializing the environment """
# visual_testing:
## True: displays rendered episodes of the trained model for user to analyze
## False: skips this step
visual_testing = True
# initializing the environment (max_episode_steps=2000 raises CartPole-v1's default 500-step cap)
env = gym.make("CartPole-v1", max_episode_steps=2000)
""" discretizing the observation space """
# cart position terminates outside (-2.4, 2.4)
cart_pos_space = np.linspace(-2.4, 2.4, 10)
# cart velocity is unbounded, but in practice stays within (-4, 4)
cart_velo_space = np.linspace(-4, 4, 10)
# pole angle terminates outside (-.2095, .2095) radians (about ±12 degrees)
pole_ang_space = np.linspace(-.2095, .2095, 10)
# pole angular velocity is unbounded, but in practice stays within (-4, 4)
pole_velo_space = np.linspace(-4, 4, 10)
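# A rough sketch of how a continuous observation is presumably turned into bin indices by
# help.Bin (the actual helper lives in helper.py and may differ; np.digitize is an assumption here):
#   obs = np.array([0.03, -1.2, 0.01, 0.5])   # [cart position, cart velocity, pole angle, pole angular velocity]
#   binned = (np.digitize(obs[0], cart_pos_space),
#             np.digitize(obs[1], cart_velo_space),
#             np.digitize(obs[2], pole_ang_space),
#             np.digitize(obs[3], pole_velo_space))   # tuple of bin indices used to index the Q-table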
""" model parameters """
training_episodes = 100000
evaluation_episodes = 1000
visual_testing_episodes = 3
epsilon_decay = -.000009
max_epsilon = 1
min_epsilon = 0.1
# add values to these lists to test/compare different rates. These were the best static rates I found, though they still did not perform well.
learning_rates = [0.45]
discount_rates = [0.9]
training_seeds = list(map(int, np.random.randint(0, 1000000, training_episodes)))
evaluation_seeds = list(map(int, np.random.randint(0, 1000000, evaluation_episodes)))
visual_testing_seeds = list(map(int, np.random.randint(0, 1000000, visual_testing_episodes)))
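# drawing all seeds up front means every (learning_rate, discount_rate) combination below is
# trained and evaluated on the same sequence of episode initializations, so scores are directly comparable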
""" Core Model Implementation Function handling Training, Visualizing and Evaluating
Returns: (q_table, [episode scores]) """
# **kwargs apply to model training (train=True)
### **kwargs to define: max_epsilon, min_epsilon, epsilon_decay, learning_rate, discount_rate
def Play(q_table, episodes, seeds, env=None, train=False, visual=False, print_progress=True, **kwargs):
track = []
# if visual, create new rendered environment for analysis. otherwise proceed.
if visual: env = gym.make("CartPole-v1", render_mode = 'human', max_episode_steps=2000)
    # loop over the designated number of episodes
for episode in trange(episodes):
        # if training, compute this episode's epsilon from the exponential decay schedule (not applied when train=False)
if train:
epsilon = kwargs['min_epsilon'] + (kwargs['max_epsilon'] - kwargs['min_epsilon'])*np.exp(kwargs['epsilon_decay']*episode)
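            # with epsilon_decay = -9e-6 this schedule falls off slowly: exp(-0.9) ~= 0.41 at episode
            # 100,000, so epsilon only drops to roughly 0.1 + 0.9 * 0.41 ~= 0.47 by the end of training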
# printing mean of prior 100 episodes every 1,000 episodes to track model progress
if print_progress and episode != 0 and episode % 1000 == 0:
            print(f'Episode {episode} - Prior 100 Mean: {np.mean(track[episode - 100:])}')
# if training, printing epsilon every 1,000 episodes for reference
if train and episode != 0 and episode % 1000 == 0:
            print(f'Episode {episode} Epsilon: {epsilon}')
# in all cases, reset environment, init cumulative reward tracker
state, _ = env.reset(seed=seeds[episode])
binned_state = help.Bin(state, cart_pos_space, cart_velo_space, pole_ang_space, pole_velo_space)
terminated, truncated = False, False
cumulative_reward = 0
# loop while env is not terminated or truncated
while not(terminated or truncated):
# determine action via epsilon greedy / greedy function, take action
            if train: action = help.EpsilonGreedy(epsilon, env, binned_state, q_table)
else: action = help.Greedy(binned_state, q_table)
new_state, reward, terminated, truncated, _ = env.step(action)
binned_new_state = help.Bin(new_state, cart_pos_space, cart_velo_space, pole_ang_space, pole_velo_space)
# if training, update q_table
if train: help.UpdateQ(q_table, kwargs['learning_rate'], kwargs['discount_rate'], binned_state, binned_new_state, action, reward)
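            # help.UpdateQ presumably applies the standard tabular Q-learning update (a sketch only,
            # the helper's exact implementation may differ):
            #   q_table[binned_state][action] += learning_rate * (reward
            #       + discount_rate * np.max(q_table[binned_new_state]) - q_table[binned_state][action])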
            # accumulate the step reward (always 1 in CartPole) and advance to the new binned state
            cumulative_reward += reward
binned_state = binned_new_state
track.append(cumulative_reward)
return (q_table, track)
""" defining main function """
def main():
trained_tables = []
# loop through learning rates and discount rates to compare parameter performance.
    # if only one value is provided for each, the loop runs a single time.
for l in learning_rates:
for d in discount_rates:
# initialize q_table
q_table = help.InitQTable(env, cart_pos_space, cart_velo_space, pole_ang_space, pole_velo_space)
# train model
trained_table = Play(q_table, training_episodes, training_seeds, env=env, train=True, max_epsilon=max_epsilon, min_epsilon=min_epsilon, epsilon_decay=epsilon_decay, learning_rate=l, discount_rate=d)[0]
trained_tables.append(trained_table)
# evaluate greedy policy for all trained models
scores = []
for table in trained_tables:
scores.append(np.mean(Play(table, evaluation_episodes, evaluation_seeds, env=env)[1]))
# displaying best performing model for visual analysis if visual_testing = True
if visual_testing:
Play(trained_tables[np.argmax(scores)], visual_testing_episodes, visual_testing_seeds, visual=True, print_progress=False)
# reshaping and printing scores to display learning and discount parameter performance
## learning rates correspond to rows, discount rates to columns
print(np.reshape(scores, (len(learning_rates), len(discount_rates))))
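    # (row, col) index of the best-scoring (learning_rate, discount_rate) pair in the grid above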
print(np.unravel_index(np.argmax(scores), (len(learning_rates), len(discount_rates))))
if __name__ == '__main__':
main()