# Main training script: sets up the Unity Tennis environment
# and runs the MADDPG training loop
from collections import deque
from maddpg_agent import Agent
import numpy as np
import os
import time
import torch
from unityagents import UnityEnvironment
N_EPISODES = 2000        # maximum number of training episodes
SOLVED_SCORE = 0.5       # moving-average score required to consider the environment solved
CONSEC_EPISODES = 100    # number of consecutive episodes over which the average is taken
PRINT_EVERY = 10         # how often (in episodes) to print progress
ADD_NOISE = True         # add exploration noise to the agents' actions
## Helper functions

def seeding(seed=1):
    """Seed the NumPy and PyTorch random number generators."""
    np.random.seed(seed)
    torch.manual_seed(seed)

def get_actions(states, add_noise):
    """Query both agents for actions given the full observation."""
    actions = [agent.act(states, add_noise) for agent in agents]
    # flatten action pairs into a single vector
    return np.reshape(actions, (1, num_agents*action_size))

def reset_agents():
    """Reset both agents at the start of an episode."""
    for agent in agents:
        agent.reset()

def learning_step(states, actions, rewards, next_states, done):
    """Pass the shared experience to each agent's step routine."""
    for i, agent in enumerate(agents):
        agent.step(states, actions, rewards[i], next_states, done, i)
## Training loop
# start environment
env = UnityEnvironment(file_name='Tennis.app')
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like: \n{}\n'.format(states[0]))
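# seed NumPy and PyTorch for reproducibility before the agents are created
# (uses the seeding() helper defined above; seed value 1 is an assumption)
seeding(1)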
# initialize agents
agents = [Agent(state_size, action_size, num_agents=1, random_seed=1) for i in range(num_agents)]
# initialize scoring
scores_window = deque(maxlen=CONSEC_EPISODES)
scores_all = []
moving_average = []
best_score = -np.inf
best_episode = 0
already_solved = False
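# make sure the checkpoint directory exists so the torch.save calls below don't fail
# (assumes checkpoints are written to ./models, as in the training loop)
os.makedirs('models', exist_ok=True)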
for i_episode in range(1, N_EPISODES+1):
    env_info = env.reset(train_mode=True)[brain_name]   # reset the environment
    states = np.reshape(env_info.vector_observations, (1, num_agents*state_size))  # flatten states
    reset_agents()
    scores = np.zeros(num_agents)

    while True:
        actions = get_actions(states, ADD_NOISE)         # choose agent actions and flatten them
        env_info = env.step(actions)[brain_name]         # send both agents' actions to the environment
        next_states = np.reshape(env_info.vector_observations, (1, num_agents*state_size))  # flatten next states
        rewards = env_info.rewards                       # get rewards
        done = env_info.local_done                       # see if the episode finished
        learning_step(states, actions, rewards, next_states, done)  # perform the learning step
        scores += np.max(rewards)                        # update scores with best reward
        states = next_states                             # roll over states to next time step
        if np.any(done):                                 # exit loop if episode finished
            break

    ep_best_score = np.max(scores)                       # record best score for episode
    scores_window.append(ep_best_score)                  # add score to recent scores
    scores_all.append(ep_best_score)                     # add score to history of all scores
    moving_average.append(np.mean(scores_window))        # recalculate moving average

    # save best score
    if ep_best_score > best_score:
        best_score = ep_best_score
        best_episode = i_episode

    # print results
    if i_episode % PRINT_EVERY == 0:
        print('Episodes {:0>4d}-{:0>4d}\tMax Reward: {:.3f}\tMoving Average: {:.3f}'.format(
            i_episode-PRINT_EVERY, i_episode, np.max(scores_all[-PRINT_EVERY:]), moving_average[-1]))

    # determine if environment is solved and keep best performing models
    if moving_average[-1] >= SOLVED_SCORE:
        if not already_solved:
            print('<-- Environment solved in {:d} episodes!'
                  '\n<-- Moving Average: {:.3f} over past {:d} episodes'.format(
                      i_episode-CONSEC_EPISODES, moving_average[-1], CONSEC_EPISODES))
            already_solved = True
            # save weights
            torch.save(agents[0].actor_local.state_dict(), 'models/checkpoint_actor_0.pth')
            torch.save(agents[0].critic_local.state_dict(), 'models/checkpoint_critic_0.pth')
            torch.save(agents[1].actor_local.state_dict(), 'models/checkpoint_actor_1.pth')
            torch.save(agents[1].critic_local.state_dict(), 'models/checkpoint_critic_1.pth')
        elif ep_best_score >= best_score:
            print('<-- Best episode so far!'
                  '\nEpisode {:0>4d}\tMax Reward: {:.3f}\tMoving Average: {:.3f}'.format(
                      i_episode, ep_best_score, moving_average[-1]))
            # save weights
            torch.save(agents[0].actor_local.state_dict(), 'models/checkpoint_actor_0.pth')
            torch.save(agents[0].critic_local.state_dict(), 'models/checkpoint_critic_0.pth')
            torch.save(agents[1].actor_local.state_dict(), 'models/checkpoint_actor_1.pth')
            torch.save(agents[1].critic_local.state_dict(), 'models/checkpoint_critic_1.pth')
        # stop training if model stops improving
        elif (i_episode - best_episode) >= 200:
            print('<-- Training stopped. Best score not matched or exceeded for 200 episodes')
            break
        else:
            continue
env.close()
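# Optional sketch: plot the per-episode scores and moving average collected above
# (assumes matplotlib is installed; not part of the training loop itself)
import matplotlib.pyplot as plt
plt.plot(scores_all, label='max score per episode')
plt.plot(moving_average, label='moving average ({} episodes)'.format(CONSEC_EPISODES))
plt.axhline(y=SOLVED_SCORE, color='r', linestyle='--', label='solved threshold')
plt.xlabel('Episode')
plt.ylabel('Score')
plt.legend()
plt.savefig('scores.png')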