# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np


class Grid: # Environment
  # the first two arguments are the grid dimensions as (rows, cols);
  # the standard grid below is 3 rows by 4 columns
  def __init__(self, rows, cols, start):
    self.rows = rows
    self.cols = cols
    self.i = start[0]
    self.j = start[1]
  def set(self, rewards, actions):
    # rewards should be a dict of: (i, j) -> r, i.e. (row, col) -> reward
    # actions should be a dict of: (i, j) -> A, i.e. (row, col) -> list of possible actions
    self.rewards = rewards
    self.actions = actions
  def set_state(self, s):
    self.i = s[0]
    self.j = s[1]

  def get_state(self):
    return (self.i, self.j)

  def current_state(self):
    return (self.i, self.j)

  def is_terminal(self, s):
    return s not in self.actions
  def move(self, action):
    # check if legal move first
    if action in self.actions[(self.i, self.j)]:
      if action == 'U':
        self.i -= 1
      elif action == 'D':
        self.i += 1
      elif action == 'R':
        self.j += 1
      elif action == 'L':
        self.j -= 1
    # return a reward (if any)
    return self.rewards.get((self.i, self.j), 0)
  def undo_move(self, action):
    # these are the opposite of what U/D/L/R should normally do
    if action == 'U':
      self.i += 1
    elif action == 'D':
      self.i -= 1
    elif action == 'R':
      self.j -= 1
    elif action == 'L':
      self.j += 1
    # raise an exception if we arrive somewhere we shouldn't be
    # should never happen
    assert(self.current_state() in self.all_states())
  def game_over(self):
    # returns true if game is over, else false
    # true if we are in a state where no actions are possible
    return (self.i, self.j) not in self.actions

  def all_states(self):
    # possibly buggy but simple way to get all states
    # either a position that has possible next actions
    # or a position that yields a reward
    return set(self.actions.keys()) | set(self.rewards.keys())

def standard_grid():
  # define a grid that describes the reward for arriving at each state
  # and the possible actions at each state
  # the grid looks like this:
  # x means you can't go there
  # s means start position
  # number means reward at that state
  # . . . 1
  # . x . -1
  # s . . .
  g = Grid(3, 4, (2, 0))
  rewards = {(0, 3): 1, (1, 3): -1}
  actions = {
    (0, 0): ('D', 'R'),
    (0, 1): ('L', 'R'),
    (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    (2, 0): ('U', 'R'),
    (2, 1): ('L', 'R'),
    (2, 2): ('L', 'R', 'U'),
    (2, 3): ('L', 'U'),
  }
  g.set(rewards, actions)
  return g

def negative_grid(step_cost=-0.1):
  # in this game we want to try to minimize the number of moves
  # so we will penalize every move
  g = standard_grid()
  g.rewards.update({
    (0, 0): step_cost,
    (0, 1): step_cost,
    (0, 2): step_cost,
    (1, 0): step_cost,
    (1, 2): step_cost,
    (2, 0): step_cost,
    (2, 1): step_cost,
    (2, 2): step_cost,
    (2, 3): step_cost,
  })
  return g
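
# A minimal usage sketch, not part of the original file: it exercises the
# environment API above (move, undo_move, game_over, all_states) on the
# standard grid. The action sequence is hand-picked for illustration.
if __name__ == '__main__':
  g = standard_grid()
  print("all states:", sorted(g.all_states()))
  print("start state:", g.current_state())

  # walk from the start s at (2, 0) up and right to the +1 terminal at (0, 3)
  for a in ('U', 'U', 'R', 'R', 'R'):
    r = g.move(a)
    print("action:", a, "-> state:", g.current_state(), "reward:", r)
  print("game over:", g.game_over())

  # undo_move steps back from the terminal state to (0, 2)
  g.undo_move('R')
  print("after undo:", g.current_state())

  # negative_grid attaches a step cost to every non-terminal state
  ng = negative_grid(step_cost=-0.1)
  print("reward for arriving at (2, 1):", ng.rewards.get((2, 1), 0))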