train_kernel_ridge.m
function model = train_kernel_ridge(param, X, Y)
% Trains a kernel ridge regression model.
%
% Usage:
% model = train_kernel_ridge(param, X, Y)
%
% Parameters:
% X       - [samples x features] matrix of training samples (should
%           not include an intercept term/column of 1's)
%           -OR-
%           [samples x samples] kernel matrix
% Y       - [samples x 1] vector of responses (for univariate
%           regression) -or-
%           [samples x m] matrix of responses (for multivariate
%           regression with m response variables)
%
% param   - struct with hyperparameters:
% .lambda - regularization parameter for ridge regression, ranging
%           from 0 (no regularization) to infinity. For lambda = 0,
%           the model yields standard linear (OLS) regression; for
%           lambda > 0 it yields ridge regression (default 1).
% .kernel - kernel function:
%           'linear'     - linear kernel ker(x,y) = x' y
%           'rbf'        - radial basis function (Gaussian) kernel
%                          ker(x,y) = exp(-gamma * |x-y|^2)
%           'polynomial' - polynomial kernel
%                          ker(x,y) = (gamma * x' * y + coef0)^degree
%           Alternatively, a custom kernel can be used if a function
%           called *_kernel is on the MATLAB path, where "*" is the
%           name of the kernel (e.g. rbf_kernel).
%
%           If a precomputed kernel matrix is provided as X, set
%           param.kernel = 'precomputed'.
%
% HYPERPARAMETERS for specific kernels:
%
% gamma   - (kernel: rbf, polynomial) controls the 'width' of the
%           kernel. If set to 'auto', gamma is set to 1/(nr of features)
%           (default 'auto')
% coef0   - (kernel: polynomial) constant added to the polynomial
%           term. If 0, the kernel is homogeneous (default 1)
% degree  - (kernel: polynomial) degree of the polynomial term. Setting
%           the degree too high makes overfitting likely (default 2)
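%
% EXAMPLE (a minimal sketch; the data and field values below are
% illustrative, only the field names follow this help text, and the
% rbf_kernel function is assumed to be on the path):
%
%   param = struct();
%   param.kernel = 'rbf';  param.gamma  = 'auto';  param.lambda = 1;
%   param.coef0  = 1;      param.degree = 2;
%   param.correlation_bound = [];
%   X = randn(100, 5);                    % 100 samples, 5 features
%   Y = sin(X(:,1)) + 0.1*randn(100, 1);  % univariate response
%   model = train_kernel_ridge(param, X, Y);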
%
% IMPLEMENTATION DETAILS:
% The solution to the kernel ridge regression problem is given by
%
% alpha = (K + lambda I)^-1 y (dual form)
%
% where lambda is the regularization hyperparameter and K is the [samples x
% samples] kernel matrix. Predictions on new data are then obtained using
%
% f(x) = alpha' * k
%
% where k = (k(x1, x), ... , k(xn, x))' is the vector of kernel evaluations
% between the training data and the test sample x.
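%
% In code this amounts to (sketch; K_test denotes the [test samples x
% training samples] kernel matrix between test and training data):
%
%   alpha = (K + lambda * eye(N)) \ Y;   % dual coefficients
%   y_hat = K_test * alpha;              % predictions for test data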
%
% REFERENCE:
% https://people.eecs.berkeley.edu/~bartlett/courses/281b-sp08/10.pdf
% (c) Matthias Treder
[N, P] = size(X);
model = struct();
% indicates whether kernel matrix has been precomputed
is_precomputed = strcmp(param.kernel,'precomputed');
%% Center X
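% for the linear kernel the features are centered and the feature mean is
% stored so that an explicit intercept can be recovered further below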
if strcmp(param.kernel, 'linear')
    model.m = mean(X);
    X = X - repmat(model.m, [N 1]);
end
%% Set kernel hyperparameter defaults
if ischar(param.gamma) && strcmp(param.gamma,'auto') && ~is_precomputed
    param.gamma = 1/size(X,2);
end
%% Check if hyperparameters need to be tuned
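% a hyperparameter provided as a vector of multiple candidate values
% triggers a tuning step via mv_tune_hyperparameters below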
tune_params = {};
tune_kernel_params = 0;
if numel(param.lambda)>1, tune_params = [tune_params {'lambda'}]; end
if ~is_precomputed
    if numel(param.gamma)>1,  tune_params = [tune_params {'gamma'}];  tune_kernel_params = 1; end
    if numel(param.coef0)>1,  tune_params = [tune_params {'coef0'}];  tune_kernel_params = 1; end
    if numel(param.degree)>1, tune_params = [tune_params {'degree'}]; tune_kernel_params = 1; end
end
%% Compute kernel
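% the kernel matrix can only be computed up front if it is precomputed or
% if no kernel hyperparameters need to be tuned; use_K flags this case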
use_K = 0;
if is_precomputed
    K = X;
    use_K = 1;
elseif tune_kernel_params == 0
    kernelfun = eval(['@' param.kernel '_kernel']);  % Kernel function
    K = kernelfun(param, X);                         % Compute kernel matrix
    use_K = 1;
end
%% Hyperparameter tuning
if ~isempty(tune_params)
    MAE = @(y, ypred) -sum(abs(y - ypred)); % negative sum of absolute errors (higher = better)
    if use_K
        param = mv_tune_hyperparameters(param, K, Y, @train_kernel_ridge, @test_kernel_ridge, ...
            MAE, tune_params, param.k, 1);
    else
        param = mv_tune_hyperparameters(param, X, Y, @train_kernel_ridge, @test_kernel_ridge, ...
            MAE, tune_params, param.k, 0);
    end
    % compute kernel with the optimal hyperparameters
    kernelfun = eval(['@' param.kernel '_kernel']);
    K = kernelfun(param, X);
end
%% Perform regularization and calculate weights
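% dual solution alpha = (K + lambda*I)^-1 Y (see IMPLEMENTATION DETAILS)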
alpha = (K + param.lambda * eye(N)) \ Y;
%% Set up model struct
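% for the linear kernel, explicit primal weights and intercept are stored;
% otherwise the training data and kernel function are kept for prediction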
model.alpha = alpha;
model.lambda = param.lambda;
model.kernel = param.kernel;
if strcmp(param.kernel, 'linear')
    model.w = X' * alpha;
    model.b = mean(Y) - model.m * model.w;
else
    if ~is_precomputed
        model.kernelfun = kernelfun;
        model.X_train = X;
    end
    % Hyperparameters
    model.gamma  = param.gamma;
    model.coef0  = param.coef0;
    model.degree = param.degree;
end
%% Correlation constraint
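% optionally rescale alpha by a factor theta so that the correlation
% between the response y and the residual y - yhat does not exceed
% param.correlation_bound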
if ~isempty(param.correlation_bound)
    % calculate correlation between y and the residual
    yc = Y - mean(Y);
    yhat = K * model.alpha;
    yhatc = yhat - mean(yhat);
    y_residual_correlation_uncorrected = corr(yc, yc - yhatc);
    if y_residual_correlation_uncorrected < param.correlation_bound
        % don't fix it if it ain't broken
        model.theta = 1;
    elseif param.correlation_bound == 0
        model.theta = (yc'*yc) / (yc' * yhatc);
    else
        y2    = yc' * yc;
        yhat2 = yhatc' * yhatc;
        yyhat = yc' * yhatc;
        rho2  = param.correlation_bound^2;
        c     = yyhat^2 - rho2*y2*yhat2;
        % since we square to solve for theta we obtain two solutions:
        % one gives corr(y,e) = rho, the other corr(y,e) = -rho
        model.theta = y2 * yyhat * (1-rho2)/c - y2/c * sqrt( rho2 * (1-rho2) * (y2*yhat2 - yyhat^2));
    end
    % scale coefficients
    model.alpha = model.alpha * model.theta;
end