# mdp_policy_iteration.r (forked from iadine/PPR)
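# Resolve a discounted MDP with the policy iteration algorithm.
# Arguments:
#   P         transition probabilities: a list of A matrices (S x S), or an S x S x A array
#   R         rewards, in any shape accepted by mdp_computePR
#   discount  discount factor in (0; 1]
#   policy0   optional initial policy (default: greedy on the immediate rewards)
#   max_iter  optional cap on the number of iterations (default: 1000)
#   eval_type 0 evaluates each policy by solving a linear system (default);
#             any other value evaluates it iteratively
# Returns a list with the value function V, the final policy, the number of
# iterations and the elapsed time. Requires the MDPtoolbox helpers
# mdp_computePR, mdp_bellman_operator, mdp_eval_policy_matrix and
# mdp_eval_policy_iterative.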
mdp_policy_iteration <- function(P, R, discount, policy0, max_iter, eval_type) {
  start <- as.POSIXlt(Sys.time())
  # (verbose output from the original toolbox is omitted here)
  # check the arguments
  if (discount <= 0 || discount > 1) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: Discount rate must be in (0; 1]')
    print('--------------------------------------------------------')
  } else if (nargs() > 3 && !missing(policy0) &&
             length(policy0) != (if (is.list(P)) dim(P[[1]])[1] else dim(P)[1])) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: policy must have the same dimension as P')
    print('--------------------------------------------------------')
  } else if (nargs() > 4 && !missing(max_iter) && max_iter <= 0) {
    print('--------------------------------------------------------')
    print('MDP Toolbox ERROR: The maximum number of iterations must be greater than 0')
    print('--------------------------------------------------------')
  } else {
    # number of states S and number of actions A
    if (is.list(P)) {
      S <- dim(P[[1]])[1]
      A <- length(P)
    } else {
      S <- dim(P)[1]
      A <- dim(P)[3]
    }
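    # PR[s, a]: expected immediate reward for taking action a in state s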
    PR <- mdp_computePR(P, R)
    # initialization of optional arguments
    if (nargs() < 6) {
      eval_type <- 0
    }
    if (nargs() < 5) {
      max_iter <- 1000
    }
    if (nargs() < 4) {
      # initial policy: the one which maximizes the expected immediate reward
      bellman <- mdp_bellman_operator(P, PR, discount, numeric(S))
      # bellman[[1]] (the value function) is not needed here
      policy0 <- bellman[[2]]
    }
    iter <- 0
    policy <- policy0
    is_done <- FALSE
    while (!is_done) {
      iter <- iter + 1
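      # policy evaluation: exact linear solve (eval_type == 0) or iterative sweeps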
      if (eval_type == 0) {
        V <- mdp_eval_policy_matrix(P, PR, discount, policy)
      } else {
        V <- mdp_eval_policy_iterative(P, PR, discount, policy)
      }
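      # policy improvement: act greedily with respect to V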
      bellman <- mdp_bellman_operator(P, PR, discount, V)
      Vnext <- bellman[[1]]  # not used below
      policy_next <- bellman[[2]]
      n_different <- sum(policy_next != policy)
      # stop when no state changes its action, or when the iteration cap is reached
      if (n_different == 0 || iter == max_iter) {
        is_done <- TRUE
      } else {
        policy <- policy_next
      }
    }
    end <- as.POSIXlt(Sys.time())
    return(list("V" = V, "policy" = policy, "iter" = iter, "time" = end - start))
  }
}
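
# Usage sketch (not part of the original file): the helper functions above ship
# with the MDPtoolbox package, whose mdp_example_forest() builds a small
# forest-management MDP; substitute your own P and R if you are not using it.
# library(MDPtoolbox)
# ex  <- mdp_example_forest()                      # small toy MDP (list with $P and $R)
# res <- mdp_policy_iteration(ex$P, ex$R, discount = 0.9)
# res$policy   # one action per state
# res$V        # state values under that policy
# res$iter     # number of policy-improvement sweeps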