nnetUpdate.m
function layers = nnetUpdate(layers, gradient)
% nnetUpdate updates the parameters of the neural network defined by layers.
% gradient is a struct array of the same size as layers containing the
% gradients for each layer's weights and biases.
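%
% Expected fields (a sketch inferred from the code below, not authoritative):
%   layers(i).W, layers(i).B            weights and biases (gpuArray)
%   layers(i).startLR, layers(i).decr   base learning rate and power-decay factor
%   layers(i).updates                   per-layer update counter (incremented here)
%   layers(i).use_adagrad, use_adadec   flags selecting AdaGrad / AdaDec scaling
%   layers(i).gammaAdaDec               decay factor for the AdaDec accumulator
%   layers(i).startAdaGrad              update count at which AdaGrad scaling starts
%   gradient(i).W, gradient(i).B        gradients matching layers(i).W / layers(i).B
% A persistent per-layer sum of squared gradients is kept between calls.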
persistent sum_squared_gradient;
nLayers = length(layers);
if isempty(sum_squared_gradient)
    sum_squared_gradient = repmat(struct('W', gpuArray.zeros(0), ...
                                         'B', gpuArray.zeros(0)), 1, nLayers);
end
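% The repmat above only creates empty placeholders; the real accumulators are
% allocated lazily, the first time a layer's gradient is seen inside the loop.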
for i = nLayers:-1:1
    layers(i).updates = layers(i).updates + 1;
    % Per-layer learning rate schedule (note: eps here shadows MATLAB's built-in eps).
    % eps = layers(i).startLR;                                          % constant
    % eps = layers(i).startLR * exp(-layers(i).updates / 10);           % exponential
    eps = layers(i).startLR / (1 + layers(i).decr * layers(i).updates); % power
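    % Example with hypothetical values: for startLR = 0.1 and decr = 0.001 the
    % power schedule gives eps close to 0.1 early on, 0.05 at update 1000
    % (0.1 / (1 + 0.001 * 1000)), and 0.025 at update 3000.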
    if layers(i).use_adagrad || layers(i).use_adadec
        if i == nLayers && layers(i).updates == 1
            % Diagnostic print (no semicolon on purpose): check that the
            % accumulator still lives on the GPU, only on the very first
            % update of the last layer.
            existsOnGPU(sum_squared_gradient(i).W)
        end
        % Lazily initialize the accumulators from the first squared gradient.
        if isempty(sum_squared_gradient(i).W)
            sum_squared_gradient(i).W = gradient(i).W .^ 2;
        end
        if isempty(sum_squared_gradient(i).B)
            sum_squared_gradient(i).B = gradient(i).B .^ 2;
        end
    end
    if layers(i).use_adagrad
        % AdaGrad: accumulate the full history of squared gradients.
        sum_squared_gradient(i).W = sum_squared_gradient(i).W + gradient(i).W .^ 2;
        sum_squared_gradient(i).B = sum_squared_gradient(i).B + gradient(i).B .^ 2;
    end
    if layers(i).use_adadec
        % AdaDec: decay the accumulator by gammaAdaDec before adding the new term.
        sum_squared_gradient(i).W = layers(i).gammaAdaDec * sum_squared_gradient(i).W + ...
                                    gradient(i).W .^ 2;
        sum_squared_gradient(i).B = layers(i).gammaAdaDec * sum_squared_gradient(i).B + ...
                                    gradient(i).B .^ 2;
    end
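    % With either accumulator, the AdaGrad-style branch below scales each step as
    %   W <- W - eps * g ./ sqrt(1 + sum_sq)
    % where the 1 inside the sqrt keeps the division well-behaved while the
    % accumulator is still small.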
    if layers(i).use_adagrad && layers(i).updates >= layers(i).startAdaGrad
        if layers(i).updates == layers(i).startAdaGrad
            % Boost the learning rate once when AdaGrad scaling kicks in.
            eps = eps * 3;
        end
        layers(i).W = layers(i).W - eps * (gradient(i).W ./ sqrt(1 + sum_squared_gradient(i).W));
        layers(i).B = layers(i).B - eps * (gradient(i).B ./ sqrt(1 + sum_squared_gradient(i).B));
    else
        % Plain SGD step. Creating temporary W and B variables sometimes seemed
        % faster (the mysteries of MATLAB); that variant is kept below for reference.
        % gradW = gradient(i).W;
        % gradB = gradient(i).B;
        % W = layers(i).W;
        % B = layers(i).B;
        % W = W - eps * gradW;
        % B = B - eps * gradB;
        % layers(i).W = W;
        % layers(i).B = B;
        layers(i).W = layers(i).W - eps * gradient(i).W;
        layers(i).B = layers(i).B - eps * gradient(i).B;
    end
    layers(i).eps = eps;
end
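
% ---------------------------------------------------------------------------
% Usage sketch (hypothetical, not part of the original training code): how a
% minibatch loop might call nnetUpdate. makeLayers and nnetBackprop are
% assumed helper names; only nnetUpdate's signature comes from this file.
%
%   layers = makeLayers(...);   % each layer needs W, B, startLR, decr, updates,
%                               % use_adagrad, use_adadec, gammaAdaDec, startAdaGrad
%   for batch = 1:nBatches
%       gradient = nnetBackprop(layers, X(:, batchIdx{batch}), T(:, batchIdx{batch}));
%       layers   = nnetUpdate(layers, gradient);
%   end
% ---------------------------------------------------------------------------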