# mdp_eval_policy_iterative.r
# function Vpolicy = mdp_eval_policy_iterative(P, R, discount, policy, V0, epsilon, max_iter)
mdp_eval_policy_iterative <- function(P, R, discount, policy, V0, epsilon, max_iter) {
  # mdp_eval_policy_iterative   Policy evaluation using iteration.
  # Arguments -----------------------------------------------------------------
  # Let S = number of states, A = number of actions
  # P(SxSxA) = transition matrix
  #   P can be an array with 3 dimensions (SxSxA) or
  #   a list (1xA), each element containing a matrix (SxS), possibly sparse
  # R(SxSxA) or (SxA) = reward matrix
  #   R can be an array with 3 dimensions (SxSxA) or
  #   a list (1xA), each element containing a sparse matrix (SxS) or
  #   a 2D array (SxA), possibly sparse
  # discount = discount rate in ]0; 1]
  # policy(S) = a policy
  # V0(S) = starting value function, optional (default: numeric(S), all zeros)
  # epsilon = threshold for the epsilon-optimal value function search,
  #   greater than 0, optional (default: 0.0001)
  # max_iter = maximum number of iterations, greater than 0,
  #   optional (default: 10000)
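  # For instance, a valid pair of inputs for S = 2 states and A = 2 actions
  # (a sketch, with arbitrarily chosen values):
  #   P <- array(0, c(2, 2, 2))
  #   P[, , 1] <- matrix(c(0.5, 0.5, 0.8, 0.2), 2, 2, byrow = TRUE)
  #   P[, , 2] <- matrix(c(0.1, 0.9, 0.3, 0.7), 2, 2, byrow = TRUE)
  #   R <- matrix(c(1, 0, 2, 3), 2, 2)  # R(SxA)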
  # Evaluation ----------------------------------------------------------------
  # Vpolicy(S) = value function associated with the given policy
  # ----------------------------------------------------------------------------
  # At termination, displays the condition that stopped the iterations:
  # epsilon-optimal value function found, or maximum number of iterations reached.
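  # The evaluation iterates the Bellman operator for the fixed policy,
  #   V_{k+1} = PR^policy + discount * P^policy %*% V_k,
  # and stops when max(abs(V_{k+1} - V_k)) < epsilon or iter == max_iter.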
  # check of arguments ---------------------------------------------------------
  # determine the number of states S from either representation of P
  if (is.list(P)) {
    S <- dim(P[[1]])[1]
  } else {
    S <- dim(P)[1]
  }
  Vpolicy <- NULL  # returned as NULL if an argument check fails
  if ( discount <= 0 || discount > 1 ) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: Discount rate must be in ]0; 1]')
    print('--------------------------------------------------------')
  } else if ( length(policy) != S ) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: policy must have the same dimension as P')
    print('--------------------------------------------------------')
  } else if ( !missing(V0) && length(V0) != S ) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: V0 must have the same dimension as P')
    print('--------------------------------------------------------')
  } else if ( !missing(epsilon) && epsilon <= 0 ) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: epsilon must be greater than 0')
    print('--------------------------------------------------------')
  } else if ( !missing(max_iter) && max_iter <= 0 ) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: The maximum number of iterations must be greater than 0')
    print('--------------------------------------------------------')
  } else {
    # initialization of optional arguments
    if (missing(V0)) {
      V0 <- numeric(S)
    }
    if (missing(epsilon)) {
      epsilon <- 0.0001
    }
    if (missing(max_iter)) {
      max_iter <- 10000
    }
    # P^policy (SxS) and PR^policy (S) for the given policy, computed by the
    # companion toolbox function mdp_computePpolicyPRpolicy
    compute <- mdp_computePpolicyPRpolicy(P, R, policy)
    Ppolicy <- compute[[1]]
    PRpolicy <- compute[[2]]
    iter <- 0
    Vpolicy <- V0
    is_done <- FALSE
    while (!is_done) {
      iter <- iter + 1
      Vprev <- Vpolicy
      # Bellman update for the fixed policy
      Vpolicy <- PRpolicy + discount * Ppolicy %*% Vprev
      # sup-norm of the change, used as the stopping criterion
      variation <- max(abs(Vpolicy - Vprev))
      if (variation < epsilon) {
        is_done <- TRUE
        print('MDP Toolbox: iterations stopped, epsilon-optimal value function')
      } else if (iter == max_iter) {
        is_done <- TRUE
        print(paste('MDP Toolbox: iterations stopped by maximum number of iterations condition', max_iter))
      }
    }
  }
  return(Vpolicy)
}
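
# ------------------------------------------------------------------------------
# Usage sketch. Assumes the companion MDPtoolbox functions mdp_example_forest
# and mdp_computePpolicyPRpolicy are available in the session; the `if (FALSE)`
# guard keeps sourcing this file free of side effects.
if (FALSE) {
  mdp <- mdp_example_forest()      # small 3-state, 2-action forest MDP
  policy <- c(1, 1, 1)             # always take action 1 in every state
  Vpi <- mdp_eval_policy_iterative(mdp$P, mdp$R, 0.9, policy)
  print(Vpi)                       # value of each state under this policy
}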