function cf = train_svm(param,X,clabel)
% Trains a support vector machine (SVM).
%
% Note: It is recommended to demean (X = demean(X,'constant')) or z-score
% (X = zscore(X)) the data first to speed up the optimization.
%
% Usage:
% cf = train_svm(param,X,clabel)
%
% Parameters:
% X - [samples x features] matrix of training samples -OR-
% [samples x samples] kernel matrix
% clabel - [samples x 1] vector of class labels
%
% param - struct with hyperparameters:
% .c - regularization hyperparameter controlling the strength of
% regularization. If a single value is given, it is used
% directly. If a vector of values is given, cross-validation
% is used to test all the values in the vector and the best
% one is selected. If set to 'auto', a default search grid
% is used to automatically determine the best c (default
% 'auto').
% .kernel - kernel function:
% 'linear' - linear kernel, trains a linear SVM
% ker(x,y) = x' * y
% 'rbf' - radial basis function or Gaussian kernel
% ker(x,y) = exp(-gamma * |x-y|^2);
% 'polynomial' - polynomial kernel
% ker(x,y) = (gamma * x' * y + coef0)^degree
% Alternatively, a custom kernel can be provided if a
% function called *_kernel is on the MATLAB path, where "*"
% is the name of the kernel (e.g. rbf_kernel); see the
% sketch after this parameter list.
%
% If a precomputed kernel matrix is provided as X, set
% param.kernel = 'precomputed'.
%
% .prob - if 1, decision values are returned as probabilities. If
% 0, the decision values are simply the distance to the
% hyperplane. Calculating probabilities takes more time
% and memory, so do not use this unless needed. A Platt
% approximation using an external function (obtained from
% http://www.work.caltech.edu/~htlin/program/libsvm/) is
% used to estimate probabilities.
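%
% Custom kernel sketch (an illustrative assumption, not part of this
% toolbox): this file calls the kernel as K = kernelfun(param, X) and
% expects a [samples x samples] kernel matrix, so a hand-written RBF
% kernel selected via param.kernel = 'mykernel' could look like:
%
%   function K = mykernel_kernel(param, X)
%   % pairwise squared Euclidean distances between samples
%   sq = sum(X.^2, 2);
%   D  = bsxfun(@plus, sq, sq') - 2*(X*X');
%   % Gaussian kernel, width controlled by param.gamma
%   K  = exp(-param.gamma * D);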
%
% Note: c is implemented analogously to the classical SVM implementations,
% see LIBSVM and LIBLINEAR. It is roughly the reciprocal of the lambda
% parameter used in LDA/logistic regression, i.e. c = 1/lambda.
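% For example, c = 100 corresponds roughly to lambda = 0.01.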
%
% HYPERPARAMETERS for specific kernels:
%
% gamma - (kernel: rbf, polynomial) controls the 'width' of the
% kernel. If set to 'auto', gamma is set to 1/(nr of features)
% or 1 if a pre-computed kernel matrix is provided
% (default 'auto')
% coef0 - (kernel: polynomial) constant added to the polynomial
% term in the polynomial kernel. If 0, the kernel is
% homogeneous (default 1)
% degree - (kernel: polynomial) degree of the polynomial term. Too
% high a degree makes overfitting likely (default 2)
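%
% For example (a sketch), a cubic polynomial kernel corresponds to
% param.kernel = 'polynomial', param.degree = 3, param.coef0 = 1,
% param.gamma = 'auto'.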
%
% TUNING: Hyperparameters can be tuned by setting a range instead of a
% single value. For instance, if param.gamma = [1e-1, 1, 1e1], a
% cross-validation is performed where each of the values is tested and
% the best value is chosen. If multiple hyperparameters are set for tuning,
% a multi-dimensional search grid is set up where all combinations of
% values are tested. Note: if a precomputed kernel is provided, only c
% can be tuned.
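%
% For example (a sketch; the grid values are arbitrary):
%   param.kernel = 'rbf';
%   param.c      = [0.1 1 10];
%   param.gamma  = [0.01 0.1 1];
% tunes c and gamma jointly over the resulting 3 x 3 grid.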
%
% Further parameters (that usually do not need to be changed):
% bias - if >0, augments the data with a bias term equal to the
% value of bias: X <- [X; bias] and augments the weight
% vector with the bias variable w <- [w; b].
% If 0, the bias is assumed to be 0. If set to 'auto', the
% bias is 1 for a linear kernel and 0 for non-linear
% kernels (default 'auto'). This is because for non-linear
% kernels, a bias is usually not needed (Kecman 2001, p.182)
% k - the number of folds in the k-fold cross-validation for
% the hyperparameter search (default 3)
% plot - if a grid search for c is performed, produces diagnostic
% plots including the regularization path and
% cross-validated accuracy as a function of c (default
% 0)
%
% Further parameters controlling the Dual Coordinate Descent optimization:
% shrinkage_multiplier - multiplier that controls the Dual Coordinate
% Descent algorithm with active set strategy. If the
% multiplier is < 1, the active set is shrunk more
% aggressively, potentially leading to speed ups (default
% 1)
% tolerance - controls the stopping criterion. If the change in
% projected gradient is below tolerance, the algorithm
% terminates
%
% Output:
% cf - struct specifying the classifier with the following fields:
% w - normal to the hyperplane (for linear SVM)
% b - bias term, setting the threshold (for linear SVM with bias)
% alpha - dual vector specifying the weighting of training samples
% for evaluating the classifier. For alpha > 0 a particular
% sample is a support vector (for kernel SVM)
%
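% Example (a minimal sketch; X and clabel are your data, and param is assumed
% to already contain a full set of default hyperparameters, e.g. obtained
% from the toolbox's hyperparameter defaults):
%
%   X = zscore(X);                     % z-scoring speeds up the optimization
%   param.kernel = 'rbf';
%   param.gamma  = 'auto';             % 1/(nr of features)
%   param.c      = 'auto';             % grid search over a default range of c
%   cf = train_svm(param, X, clabel);
%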
% IMPLEMENTATION DETAILS:
% A Dual Coordinate Descent algorithm is used to find the optimal alpha
% (weights on the samples), implemented in DualCoordinateDescent.
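%
% In this notation, the dual problem solved here is (a sketch, with the bias
% absorbed into the data as described above):
%
%   min_alpha   1/2 * alpha' * Q * alpha - ONE' * alpha
%   subject to  0 <= alpha_i <= c  for all i
%
% where Q(i,j) = y_i * y_j * ker(x_i, x_j) and ONE is a vector of ones.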
%
% REFERENCES:
% V Kecman (2001). Learning and Soft Computing: Support Vector Machines,
% Neural Networks, and Fuzzy Logic Models. MIT Press
%
% Lin, Lin & Weng (2007). A note on Platt's probabilistic outputs for
% support vector machines. Machine Learning, 68(3), 267-276
% (c) Matthias Treder
N = size(X,1);
% Vector of 1's we need for optimization
ONE = ones(N,1);
% Make sure labels come as column vector
clabel = double(clabel(:));
% Need class labels 1 and -1 here
clabel(clabel == 2) = -1;
% indicates whether kernel matrix has been precomputed
is_precomputed = strcmp(param.kernel,'precomputed');
%% Set kernel hyperparameter defaults
if ischar(param.gamma) && strcmp(param.gamma,'auto') && ~is_precomputed
    param.gamma = 1/size(X,2);
end
%% Set search grid for c
if ischar(param.c) && strcmp(param.c,'auto')
    param.c = logspace(-2,2,5);
end
%% Bias
if ischar(param.bias) && strcmp(param.bias,'auto')
    if strcmp(param.kernel,'linear')
        param.bias = 1;
    else
        param.bias = 0;
    end
end
% Augment X with bias
if param.bias > 0
    X = cat(2,X, ones(N,1) * param.bias );
end
%% Check if kernel hyperparameters need to be tuned
tune_kernel_parameters = {};
if ~is_precomputed
    % Kernel function
    kernelfun = str2func([param.kernel '_kernel']);
    tmp = {};
    switch(param.kernel)
        case 'rbf'
            if numel(param.gamma)>1, tmp = {'gamma' param.gamma}; end
        case 'polynomial'
            if numel(param.gamma)>1, tmp = {'gamma' param.gamma}; end
            if numel(param.coef0)>1, tmp = [tmp(:)' {'coef0'} {param.coef0}]; end
            if numel(param.degree)>1, tmp = [tmp(:)' {'degree'} {param.degree}]; end
    end
    tune_kernel_parameters = tmp;
end
%% Compute and regularize kernel
if isempty(tune_kernel_parameters)
    if is_precomputed
        K = X;
    else
        % Compute kernel matrix
        K = kernelfun(param, X);
    end
    % Regularize
    if param.regularize_kernel > 0
        K = K + param.regularize_kernel * eye(size(X,1));
    end
    % Create a copy of K wherein class labels 1 and -1 are absorbed in the
    % kernel matrix. This is useful for optimization.
    Q = K .* (clabel * clabel');
end
%% Optimize hyperparameters using nested cross-validation
if numel(param.c)>1 || ~isempty(tune_kernel_parameters)
    tune_hyperparameter_svm
end
%% Train classifier on the full training data
% Solve the dual problem and obtain alpha
alpha = DualCoordinateDescent(Q, param.c, ONE, param.tolerance, param.shrinkage_multiplier);
%% Set up classifier struct
cf = [];
cf.kernel = param.kernel;
cf.c = param.c;
cf.alpha = alpha;
cf.gamma = param.gamma;
cf.bias = param.bias;
cf.prob = param.prob;
if strcmp(param.kernel,'linear')
    % Calculate linear weights w and bias b from alpha
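    % (w = sum_i alpha_i * y_i * x_i; only samples with alpha_i > 0, the
    % support vectors, contribute)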
    cf.w = X' * (cf.alpha .* clabel(:));
    if param.bias > 0
        cf.b = cf.w(end);
        cf.w = cf.w(1:end-1);
        % Bias term needs correct scaling
        cf.b = cf.b * param.bias;
    else
        cf.b = 0;
    end
else
    % Nonlinear kernel: also need to save support vectors
    cf.support_vector_indices = find(cf.alpha>0);
    % Class labels for the support vectors
    cf.y = clabel(cf.support_vector_indices);
    % For convenience we save the product alpha * y for the support vectors
    cf.alpha_y = cf.alpha(cf.support_vector_indices) .* cf.y(:);
    if param.bias > 0
        %%% TODO: this part might need checking
        cf.b = 0;
        % % remove bias part from support vectors
        % cf.support_vectors = cf.support_vectors(:,1:end-1);
    else
        cf.b = 0;
    end
    if ~is_precomputed
        cf.support_vectors = X(cf.support_vector_indices,:);
        cf.kernelfun = kernelfun;
    end
    % Save hyperparameters
    cf.gamma = param.gamma;
    cf.coef0 = param.coef0;
    cf.degree = param.degree;
end
if cf.prob == 1
    % Invoke external function platt.m. It calculates parameters A and B of
    % the sigmoid that models the probabilities. The method is known as
    % Platt's approximation.
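    % The fitted model is the sigmoid P(class 1 | dval) = 1 / (1 + exp(A*dval + B)).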
    prior0 = sum(clabel == -1); % prior0: number of negative points
    prior1 = sum(clabel == 1);  % prior1: number of positive points
    % Calculate decision values for training data
    if strcmp(cf.kernel,'linear')
        dval = X(:,1:end-(param.bias>0)) * cf.w + cf.b;
    else
        dval = K(:, cf.support_vector_indices) * cf.alpha_y + cf.b;
    end
    % Invoke platt function to get sigmoid parameters
    [cf.A, cf.B] = platt(dval, clabel, prior0, prior1);
end
end