%bcmdp_init.m
%initialize all bc and mdp structs, cast bcmdp_* functions as * functions
% clear
%%settings for bounded confidence model/dynamics
%number of agents
bc.n = 2;
%right and left confidence bounds
bc.eps_r = 0.15;
bc.eps_l = bc.eps_r;
% % bc.eps_l = 0.25;
%probability of no msg (or it might be prob of a msg going through...)
bc.p = 0.8;%0.2;
%probability of wrong msg (round)
bc.q = 0.05;%0.05;
%whether initial conditions are uniform linspace 'unif' or random 'rand'
%-- have to do random start, since the first m agents are controlled
%also include 'fixrand'--predetermined random starts for each episode
bc.init = 'rand';
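%a minimal sketch of the three options (assumption: the actual draw happens in
%the bcmdp_* run code, and beliefs are taken to live in [0,1]):
%  'unif'    -> x0 = linspace(0,1,bc.n)';     %evenly spaced beliefs
%  'rand'    -> x0 = rand(bc.n,1);            %fresh random start each episode
%  'fixrand' -> x0 = mdp.s0(:,episode);       %pre-drawn starts (see mdp.s0 below)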
%time horizon for each episode
bc.T = 500;
%budget (amount of total control effort)
% % bc.beta = 0.2/bc.n; %usually use 0.1, 0.05, 0.02
bc.beta = 0.066;
%%settings for mdp
%number of agents to control
mdp.m = 1;
%number of independent agents; only valid if m = 1. If using coordinated
%agents instead, set MAS to 1
mdp.MAS = 1;
if mdp.MAS > 1 && mdp.m > 1
    error('Using multiple independent agents requires mdp.m = 1');
end
%number of bins with which to discretize the spans [del,eps_r] and
%[-eps_l,-del]; total number of features is 2*n_bins+3
mdp.n_bins = 13;
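%for illustration (assumption: uniform bins; the actual featurization lives in
%the bcmdp_* feature code), the positive-side bin edges could look like
%  edges_pos = linspace(mdp.del, bc.eps_r, mdp.n_bins+1);
%mirrored for [-bc.eps_l, -mdp.del]; the remaining 3 features presumably cover
%the 'zero' band [-del,del] and the two regions beyond the confidence bounds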
mdp.n_episodes = 15000;
%whether we are learning weights, trying out a given weight vector w, or just
%checking the dynamics of the system: 'learn', 'policy', or 'dynamics'
mdp.run_type = 'learn';
%what kind of reward function: 'L1', 'L2', or 'const'
mdp.reward_fn = 'const';
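%one plausible reading of these options (assumption; the reward itself is
%computed in bcmdp_main, with x the vector of agent beliefs):
%  'L1'    -> r = -sum(abs(x - mdp.goal));
%  'L2'    -> r = -norm(x - mdp.goal);
%  'const' -> r = -1;   %constant per-step cost until termination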
%goal state, a function of the mdp, not the dynamics
mdp.goal = 0.75;
%if everyone is within this distance of the goal state, we're done
mdp.stop_thresh = 0.05;
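%e.g., an episode would terminate once all(abs(x - mdp.goal) <= mdp.stop_thresh)
%holds, with x the belief vector (assumed name; the check happens in bcmdp_main)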
%eps: prob of random action
mdp.eps = 0.2;
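%a minimal sketch of epsilon-greedy selection (assumption: Q is a vector of
%action values computed in bcmdp_main; actions are columns of mdp.A, set below):
%  if rand < mdp.eps
%      idx = randi(size(mdp.A,2));   %explore: uniform random action
%  else
%      [~,idx] = max(Q);             %exploit: greedy action
%  end
%  a = mdp.A(:,idx);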
%weights on each element of the reward function:
%the distance from the goal always has (implicit) weight 1; the second entry
%weights the diameter of the agents' belief space, the third the variance of
%the belief values (these encode deviance/spread)
mdp.J = [0 0];
%decay rate of eligibility traces (if used)
mdp.lambda = 1;
%trace type, 'accumulating' or 'replacing'
mdp.trace_type = 'replacing';
%discount factor
mdp.gamma = 1;
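%a minimal sketch of the trace update (assumption: e and phi are the trace and
%feature vectors maintained in bcmdp_main):
%  if strcmpi(mdp.trace_type,'accumulating')
%      e = mdp.gamma*mdp.lambda*e + phi;        %accumulating traces
%  else
%      e = max(mdp.gamma*mdp.lambda*e, phi);    %replacing traces (binary features)
%  end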
%learning rates
mdp.alpha_w = 0.03;
mdp.alpha_r = 0.1;%0.001; %use relative reward
mdp.alpha_u = 0.03;
%more learning rates...
%
%if using softmax, need to define temperature, tau
mdp.tau = 25;
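%a minimal sketch of softmax (Boltzmann) selection with temperature tau
%(assumption: Q is the vector of action values from bcmdp_main):
%  p = exp(Q/mdp.tau); p = p/sum(p);
%  idx = find(rand < cumsum(p), 1);
%  a = mdp.A(:,idx);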
%what kind of resource allocation, uniform or dirichlet
mdp.resource_alloc = 'unif';
%number of tiles if using tile coding
mdp.num_tiles = 10;
%action space: each action is a column of mdp.A, indexed 1,2,...,3^m (e.g. 1,2,...,9 for m = 2)
as_ = repmat([-1 0 1],1,mdp.m);
A_ = unique(combntns(as_,mdp.m),'rows');
% A = {[0 1],[1 1],[-1 1]};
A = A_';
% A = cell(size(A_,1),1);
% for i = 1:size(A_,1)
% A{i} = [A_(i,:)'];%;zeros(bc.n-mdp.m,1)];
% end
mdp.A = A;
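%with mdp.m = 1 this gives mdp.A = [-1 0 1]; in general the columns of mdp.A
%enumerate {-1,0,1}^m, one action per column
%(note: combntns is part of the Mapping Toolbox; nchoosek(as_,mdp.m) would be
%an equivalent base-MATLAB call if combntns is unavailable)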
%===DEPRECATED BY REALIZATION OF MDP.A===
%indicator vector (for AND) of which agents are controllable
% mdp.U = zeros(bc.n,1);
%========================================
%max number of features;
mdp.M = 10000;
%the width of the feature bin that corresponds to something being 'zero'
mdp.del = 0.08;
% 'explicit', 'hist', or 'tile' for now; combined types such as 'explicit_tile' also appear
mdp.feature_type = 'explicit_tile';
%how to scale the histogram-like features: 'binary' or 'sqrt'; anything else
%means no scaling
mdp.feature_scaling = 'unscaled';
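%one plausible reading (assumption): 'binary' -> phi = double(counts > 0),
%'sqrt' -> phi = sqrt(counts), otherwise phi = counts (no scaling)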
% 'softmax' or 'hardmax' for now
mdp.action_type = 'hardmax';
if ~strcmpi(mdp.run_type,'learn')
    mdp.n_episodes = 1;
end
%what learning algorithm to use: 'q','sarsa','tdc','gtd2','copdac-gq'
mdp.learning_rule = 'tdc';
%what norm to plot for plotnorms and saving norms
mdp.plot_type = 'L2';
mdp.display_episode_interval = 50;
if strcmpi(bc.init,'fixrand')
    mdp.s0 = rand(bc.n,mdp.n_episodes);
end
num_runs = 1;
% bcmdp_run;
%num_runs is not a property of the bc dynamics or the mdp... it's a
%metavariable
% num_runs = 1;
histories = cell(num_runs,1);
%run the program
total_time = 0;
for run = 1:num_runs
    bcmdp_init_history;
    t0 = tic;
    bcmdp_main;
    tf = toc(t0);
    total_time = total_time + tf;
    disp(['Run ' int2str(run) ' Complete! It took ' int2str(tf) ' seconds!']);
    histories{run} = history;
end
if num_runs > 1
    disp(['All ' int2str(num_runs) ' runs took ' int2str(total_time) ' seconds!']);
    figure;
    bcmdp_plot_avg(histories,bc,mdp);
end