function [model] = classifierLogisticRegression( varargin )
% train a k-class logistic regression classifier, with optional regularization via the l2 norm of the weight vector(s)
%
% in:
% - examples - #examples x #voxels
% - labels - #examples x #classes binary indicator matrix (entry (i,c) is 1 iff example i belongs to class c)
% - optional (see examples below for usage guidelines)
% - 'lambda',<value> - scalar (if 0, no regularization, default is 1)
% - 'lambda','crossvalidation',<vector of lambda values>
% - 'labelsGroup',<labelsgroup> (#examples x 1, useful together with 'lambda','crossvalidation')
%
%
% out:
% - a model structure, with fields
% - W - a 1+#features x #classes matrix (first row contains bias terms for each class)
% - weights - #features x #classes matrix (same as W(2:end,:))
% - biases - 1 x #classes vector (same as W(1,:))
% dependencies:
% - carl rasmussen's minimize.m function (an adapted version, renamed optimize, bundled below)
% - classifierLogisticRegression_gradients (bundled below; it computes the objective value and
% gradients for the current weight guess, as required by minimize)
%
% notes:
% - learns weights w to maximize the log likelihood of the data
% - the l2 penalty is -0.5 * lambda * sum_#classes_c norm(w(2:end,c))^2
% (the square of the l2 norm of the weight vector for each class, added over classes)
%
% history:
% - 2009 june - created from previous code - [email protected]
%
% examples:
%
% [model] = classifierLogisticRegression(examples,labels);
% [model] = classifierLogisticRegression(examples,labels,'lambda',10); % l2 lambda = 10
% [model] = classifierLogisticRegression(examples,labels,'lambda',0); % no regularization
%
% cross-validates within the training set (leave-one-example-of-each-class-out) to decide on
% which lambda value to use (slow)
% [model] = classifierLogisticRegression(examples,labels,'lambda','crossvalidation',[1,0.1,10,0.01,100,0.001,1000]);
%
% cross-validates within the training set as before, but uses the group label of each example to
% do leave-one-group-out cross-validation within the training set (faster). the groups could be
% runs, for instance, or any other division of the data, as long as examples that are very close
% together in time (i.e. whose hemodynamic responses overlap, or that sit in the same block) are
% kept in the same group.
%
% [model] = classifierLogisticRegression(examples,labels,'lambda','crossvalidation',[1,0.1,10,0.01,100,0.001,1000],'labelsGroup',labelsgroup);
%
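% applying a trained model to held-out data (a minimal sketch; examplesTest is a hypothetical
% #test examples x #voxels matrix, not part of this function):
%
% decisions = examplesTest * model.weights + repmat(model.biases,size(examplesTest,1),1);
% [discard,predictedClass] = max(decisions,[],2); % index of the winning class per example
%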
%% process arguments
this = 'classifierLogisticRegression';
if nargin < 2; help(this); return; else
examples = varargin{1};
labels = varargin{2};
% defaults for options
regularization = 'L2'; lambda = 1;
runSilent = 1;
labelsGroup = [];
optimizationMethod = 'minimize';
if nargin > 2
% there are additional arguments to process
idx = 3;
while idx <= nargin
argName = varargin{idx}; idx = idx + 1;
switch argName
case {'lambda'}
lambda = varargin{idx}; idx = idx + 1;
switch lambda
case {'crossvalidation'}
% extra arguments are needed
lambdaRange = varargin{idx}; idx = idx + 1;
otherwise
% we're done
end
case {'labelsGroup'}
labelsGroup = varargin{idx}; idx = idx + 1;
case {'runSilent'}
runSilent = 1;
otherwise
error('%s: unknown argument %s',this,argName);
end
end
end
end
%% compute information about labels
%sortedLabelValues = sort(unique(labels)); %commented out by JR
% conditions = %sortedLabelValues; %commented out by JR
% nClasses = length(sortedLabelValues); %commented out by JR
nClasses = size(labels,2);
[nExamples,nFeatures] = size(examples);
% find examples in each class and create a binary indicator mask
delta = zeros(nExamples,nClasses);
for c = 1:nClasses
%label = sortedLabelValues(c); %commented out by JR
indices{c} = find(labels(:,c) == 1); %changed '== labels' to '== 1' to conform to mvpa toolbox standards
nperclass(c) = length(indices{c});
delta(indices{c},c) = 1;
end
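% e.g. with labels = [1 0;0 1;1 0] (3 examples, 2 classes) this yields indices = {[1;3],2},
% nperclass = [2 1] and delta equal to labels itself (a small illustration, assuming the
% indicator label convention described in the help above)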
%% if requested, run a cross validation to set the value of lambda
if isequal(lambda,'crossvalidation')
fprintf('%s: using crossvalidation to find lambda\n',this);
if isempty(labelsGroup)
% we have no group labels, create fake group labels to run leave-one-example-out
nGroups = min(nperclass);
labelsGroup = zeros(nExamples,1);
for c = 1:nClasses
nc = nperclass(c);
tmp = zeros(nc,1);
idx = 1;
while nc >= nGroups
tmp(idx:(idx+nGroups-1)) = 1:nGroups;
nc = nc - nGroups;
idx = idx + nGroups;
end
if nc > 0
tmp(idx:(idx+nc-1)) = 1:nc;
end
labelsGroup(indices{c}) = tmp;
end
end
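% e.g. with nperclass = [4 5] this gives nGroups = 4: class 1 gets groups 1:4, and class 2
% gets groups 1:4 plus a leftover 5th example assigned to group 1 (illustration only)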
%% set up a leave-one-group-out cross-validation
nl = length(lambdaRange);
groupValues = unique(labelsGroup); nGroups = length(groupValues);
for g = 1:nGroups
group = groupValues(g); mask = (labelsGroup == group);
indicesTest{g} = find( mask); nTest(g) = length(indicesTest{g});
indicesTrain{g} = find(~mask); nTrain(g) = length(indicesTrain{g});
end
%% run it for every value of lambda in the range given
cvresults = zeros(1,nl);
for r = 1:nl
lambda = lambdaRange(r);
fprintf('\ttesting lambda=%s (%d folds)\n',num2str(lambda),nGroups);
predictedLabels = zeros(nExamples,1);
for g = 1:nGroups
examplesTest = examples(indicesTest{g},:);
labelsTest = labels( indicesTest{g},:);
examplesTrain = examples(indicesTrain{g},:);
labelsTrain = labels( indicesTrain{g},:);
cvmodel = classifierLogisticRegression( examplesTrain,labelsTrain,'lambda',lambda ); % lambda is numeric here, so this call does not recurse into crossvalidation
decisions = examplesTest * cvmodel.weights + repmat(cvmodel.biases,nTest(g),1);
[discard,maxpos] = max(decisions,[],2);
predictedLabels(indicesTest{g}) = maxpos; % predicted class indices (sortedLabelValues was removed above)
end
[discard,correctLabels] = max(labels,[],2); % true class indices from the indicator matrix
cvresults(r) = sum(predictedLabels == correctLabels)/nExamples; % fraction classified correctly
end
[maxval,maxpos] = max(cvresults);
lambda = lambdaRange(maxpos);
fprintf('%s: crossvalidation estimated lambda is %s\n',this,num2str(lambda));
clear examplesTrain examplesTest labelsTrain labelsTest;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Train classifier
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
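% (the training below uses the lambda selected by crossvalidation above, or the value passed in)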
% initial guess for the weight matrix
%W = randn(nFeatures+1,nClasses) * 0.1;
W = zeros(nFeatures+1,nClasses);
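% starting from zero is safe here: the l2-regularized multinomial log likelihood is concave,
% so the optimization has no bad local optima and does not need a random initial guess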
switch optimizationMethod
case {'minimize'}
% use carl rasmussen's minimize.m (renamed to optimize to avoid a clash with CVX, appended below);
% a function handle is passed because the gradient function is local to this file
[Wvector,Wvalue] = optimize(W(:),@classifierLogisticRegression_gradients,1000,examples,labels,lambda,indices,delta,regularization,'minimize');
end
% get the objective at the end
[f,df,fpart,lambdapart] = classifierLogisticRegression_gradients(Wvector,examples,labels,lambda,indices,delta,regularization,'minimize');
if ~runSilent; fprintf('minimize: fpart = %s lambdapart = %s\n',num2str(fpart),num2str(lambdapart)); end
W = reshape(Wvector,size(W));
tmp = W(2:end,:);
if ~runSilent; fprintf('L1(W)=%s\tL2(W)=%s\n',num2str(sum(abs(tmp(:)))),num2str(sqrt(sum(tmp(:).^2)))); end
obj = -Wvalue; % it returns -1*objective
%% Pack the results into a struct
model.W = W;
model.biases = W(1,:); % to help people
model.weights = W(2:end,:); % decipher the output
%model.sortedLabelValues = sortedLabelValues;
% training Set information
trainingSetInfo.nExamples = nExamples;
trainingSetInfo.nFeatures = nFeatures;
trainingSetInfo.nLabels = nClasses;
trainingSetInfo.nClasses = nClasses; % same as nLabels
%trainingSetInfo.sortedLabelValues = sortedLabelValues;
trainingSetInfo.classPriors = nperclass / sum(nperclass);
model.trainingSetInfo = trainingSetInfo;
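% a minimal sketch for turning the model's linear decisions into class probabilities
% (softmax; X is a hypothetical #examples x #features matrix, not defined in this function):
%
% scores = X * model.weights + repmat(model.biases,size(X,1),1);
% probs = exp(scores) ./ repmat(sum(exp(scores),2),1,size(scores,2));
%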
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% a function that can compute the various gradients used by classifierLogisticRegression,
% used inside the optimization loop in that function
%
% history:
% - 2009 june - created from previous code - [email protected]
%
function [f,df,fpart,lambdapart] = classifierLogisticRegression_gradients(Wvectorized,X,Y,lambda,yindices,ymask,regularization,optimization)
[n,m] = size(X); k = size(ymask,2); tm = m + 1;
W = reshape(Wvectorized,[tm k]);
df = zeros(tm,k);
% 1) the E part, each example contributes to df estimates for its own class only
df(1,:) = sum(ymask,1);
for j = 1:k
df(2:end,j) = sum(X(yindices{j},:),1)';
end
% 2) the log part (hence the - signs)
tmp1 = X * W(2:end,:) + repmat(W(1,:),n,1);
tmp2 = exp(tmp1); % n x k
tmp3 = tmp2 ./ repmat(sum(tmp2,2),1,k); % n x k
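% note: exp can overflow for large activations; a numerically safer variant (an assumption,
% not in the original code) subtracts the per-row max before exponentiating:
% mx = max(tmp1,[],2); tmp2 = exp(tmp1 - repmat(mx,1,k));
% this leaves tmp3 unchanged (softmax is shift-invariant), but the objective below would
% then need mx added back: log(sum(tmp2,2)) becomes mx + log(sum(tmp2,2))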
df(1,:) = df(1,:) - sum(tmp3,1);
for j = 1:k
df(2:end,j) = df(2:end,j) - sum(X .* repmat(tmp3(:,j),[1,m]),1)';
end
%f_eterm = sum(tmp1.*ymask,2);
%f_logterm = log(sum(tmp2,2));
%f = sum( f_eterm - f_logterm );
f = sum( sum(tmp1.*ymask,2) - log(sum(tmp2,2)) );
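% i.e. f = sum_i [ w_{y_i}'x_i + b_{y_i} - log sum_c exp(w_c'x_i + b_c) ],
% the multinomial log likelihood of the training data under the current weights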
fpart = f;
tmp = W(2:end,:);
% norms are not over bias term
switch regularization
case {'L2'}
lambdapart = 0.5*lambda*sum(tmp(:).^2);
f = f - lambdapart;
df(2:end,:) = df(2:end,:)-lambda*tmp; % vectorize it
end
df = df(:); % vectorize it
switch optimization
case {'minimize'}
% the function (-1 is because the optimization package does minimization)
% (this is the only part of the bound where E appears, so only compute this)
f = -1*f;
df = -1*df;
otherwise
% do nothing to the function and gradient output
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% carl edward rasmussen's minimize modified to
% - not print anything
% - use less stringent convergence criteria
% to be used in inner loops of other code
function [X, fX, i] = optimize(X, f, length, varargin)
% Minimize a differentiable multivariate function.
%
% Usage: [X, fX, i] = optimize(X, f, length, P1, P2, P3, ... )
%
% where the starting point is given by "X" (D by 1), and the function named in
% the string "f", must return a function value and a vector of partial
% derivatives of f wrt X, the "length" gives the length of the run: if it is
% positive, it gives the maximum number of line searches, if negative its
% absolute gives the maximum allowed number of function evaluations. You can
% (optionally) give "length" a second component, which will indicate the
% reduction in function value to be expected in the first line-search (defaults
% to 1.0). The parameters P1, P2, P3, ... are passed on to the function f.
%
% The function returns when either its length is up, or if no further progress
% can be made (ie, we are at a (local) minimum, or so close that due to
% numerical problems, we cannot get any closer). NOTE: If the function
% terminates within a few iterations, it could be an indication that the
% function values and derivatives are not consistent (ie, there may be a bug in
% the implementation of your "f" function). The function returns the found
% solution "X", a vector of function values "fX" indicating the progress made
% and "i" the number of iterations (line searches or function evaluations,
% depending on the sign of "length") used.
%
% The Polack-Ribiere flavour of conjugate gradients is used to compute search
% directions, and a line search using quadratic and cubic polynomial
% approximations and the Wolfe-Powell stopping criteria is used together with
% the slope ratio method for guessing initial step sizes. Additionally a bunch
% of checks are made to make sure that exploration is taking place and that
% extrapolation will not be unboundedly large.
%
% See also: checkgrad
%
% Copyright (C) 2001 - 2006 by Carl Edward Rasmussen (2006-09-08).
INT = 0.1; % don't reevaluate within 0.1 of the limit of the current bracket
EXT = 3.0; % extrapolate maximum 3 times the current step-size
MAX = 20; % max 20 function evaluations per line search
RATIO = 10; % maximum allowed slope ratio
SIG = 0.1; RHO = SIG/2; % SIG and RHO are the constants controlling the Wolfe-
% Powell conditions. SIG is the maximum allowed absolute ratio between
% previous and new slopes (derivatives in the search direction), thus setting
% SIG to low (positive) values forces higher precision in the line-searches.
% RHO is the minimum allowed fraction of the expected (from the slope at the
% initial point in the linesearch). Constants must satisfy 0 < RHO < SIG < 1.
% Tuning of SIG (depending on the nature of the function to be optimized) may
% speed up the minimization; it is probably not worth playing much with RHO.
% The code falls naturally into 3 parts, after the initial line search is
% started in the direction of steepest descent. 1) we first enter a while loop
% which uses point 1 (p1) and (p2) to compute an extrapolation (p3), until we
% have extrapolated far enough (Wolfe-Powell conditions). 2) if necessary, we
% enter the second loop which takes p2, p3 and p4 chooses the subinterval
% containing a (local) minimum, and interpolates it, until an acceptable point
% is found (Wolfe-Powell conditions). Note, that points are always maintained
% in order p0 <= p1 <= p2 < p3 < p4. 3) compute a new search direction using
% conjugate gradients (Polack-Ribiere flavour), or revert to steepest if there
% was a problem in the previous line-search. Return the best value so far, if
% two consecutive line-searches fail, or whenever we run out of function
% evaluations or line-searches. During extrapolation, the "f" function may fail
% either with an error or returning NaN or Inf, and minimize should handle this
% gracefully.
if max(size(length)) == 2, red=length(2); length=length(1); else red=1; end
if length>0, S='Linesearch'; else S='Function evaluation'; end
i = 0; % zero the run length counter
ls_failed = 0; % no previous line search has failed
[f0 df0] = feval(f, X, varargin{:}); % get function value and gradient
fX = f0;
i = i + (length<0); % count epochs?!
s = -df0; d0 = -s'*s; % initial search direction (steepest) and slope
x3 = red/(1-d0); % initial step is red/(|s|+1)
f0prev = f0; % FP
while i < abs(length) % while not finished
i = i + (length>0); % count iterations?!
X0 = X; F0 = f0; dF0 = df0; % make a copy of current values
if length>0, M = MAX; else M = min(MAX, -length-i); end
while 1 % keep extrapolating as long as necessary
x2 = 0; f2 = f0; d2 = d0; f3 = f0; df3 = df0;
success = 0;
while ~success && M > 0
try
M = M - 1; i = i + (length<0); % count epochs?!
[f3 df3] = feval(f, X+x3*s, varargin{:});
if isnan(f3) || isinf(f3) || any(isnan(df3)+isinf(df3)), error(''), end
success = 1;
catch % catch any error which occured in f
x3 = (x2+x3)/2; % bisect and try again
end
end
if f3 < F0, X0 = X+x3*s; F0 = f3; dF0 = df3; end % keep best values
d3 = df3'*s; % new slope
if d3 > SIG*d0 || f3 > f0+x3*RHO*d0 || M == 0 % are we done extrapolating?
break
end
x1 = x2; f1 = f2; d1 = d2; % move point 2 to point 1
x2 = x3; f2 = f3; d2 = d3; % move point 3 to point 2
A = 6*(f1-f2)+3*(d2+d1)*(x2-x1); % make cubic extrapolation
B = 3*(f2-f1)-(2*d1+d2)*(x2-x1);
x3 = x1-d1*(x2-x1)^2/(B+sqrt(B*B-A*d1*(x2-x1))); % num. error possible, ok!
if ~isreal(x3) || isnan(x3) || isinf(x3) || x3 < 0 % num prob | wrong sign?
x3 = x2*EXT; % extrapolate maximum amount
elseif x3 > x2*EXT % new point beyond extrapolation limit?
x3 = x2*EXT; % extrapolate maximum amount
elseif x3 < x2+INT*(x2-x1) % new point too close to previous point?
x3 = x2+INT*(x2-x1);
end
end % end extrapolation
while (abs(d3) > -SIG*d0 || f3 > f0+x3*RHO*d0) && M > 0 % keep interpolating
if d3 > 0 || f3 > f0+x3*RHO*d0 % choose subinterval
x4 = x3; f4 = f3; d4 = d3; % move point 3 to point 4
else
x2 = x3; f2 = f3; d2 = d3; % move point 3 to point 2
end
if f4 > f0
x3 = x2-(0.5*d2*(x4-x2)^2)/(f4-f2-d2*(x4-x2)); % quadratic interpolation
else
A = 6*(f2-f4)/(x4-x2)+3*(d4+d2); % cubic interpolation
B = 3*(f4-f2)-(2*d2+d4)*(x4-x2);
x3 = x2+(sqrt(B*B-A*d2*(x4-x2)^2)-B)/A; % num. error possible, ok!
end
if isnan(x3) || isinf(x3)
x3 = (x2+x4)/2; % if we had a numerical problem then bisect
end
x3 = max(min(x3, x4-INT*(x4-x2)),x2+INT*(x4-x2)); % don't accept too close
[f3 df3] = feval(f, X+x3*s, varargin{:});
if f3 < F0, X0 = X+x3*s; F0 = f3; dF0 = df3; end % keep best values
M = M - 1; i = i + (length<0); % count epochs?!
d3 = df3'*s; % new slope
end % end interpolation
if abs(d3) < -SIG*d0 && f3 < f0+x3*RHO*d0 % if line search succeeded
X = X+x3*s; f0 = f3; fX = [fX' f0]'; % update variables
change = abs((f0prev-f0)/(f0prev)); % FP
f0prev = f0; % FP
%if ((i<=10) | (~rem(i,10))) fprintf('%s %6i; Value %4.10e %f\r', S, i, f0,change); end % FP
% fprintf('%s %6i; Value %4.6e\r', S, i, f0);
s = (df3'*df3-df0'*df3)/(df0'*df0)*s - df3; % Polack-Ribiere CG direction
df0 = df3; % swap derivatives
d3 = d0; d0 = df0'*s;
if d0 > 0 % new slope must be negative
s = -df0; d0 = -s'*s; % otherwise use steepest direction
end
x3 = x3 * min(RATIO, d3/(d0-realmin)); % slope ratio but max RATIO
ls_failed = 0; % this line search did not fail
if change < 0.001; %fprintf('converged with objective percent change %f\n',change);
break;
end
else
X = X0; f0 = F0; df0 = dF0; % restore best point so far
if ls_failed || i > abs(length) % line search failed twice in a row
break; % or we ran out of time, so we give up
end
s = -df0; d0 = -s'*s; % try steepest
x3 = 1/(1-d0);
ls_failed = 1; % this line search failed
end
end