AdaDelta Solver (v3) #2782
New file (MNIST LeNet solver definition using AdaDelta):

@@ -0,0 +1,24 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
test_iter: 100
# Carry out testing every 500 training iterations.
test_interval: 500
# The base learning rate, momentum and the weight decay of the network.
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
weight_decay: 0.0005
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 10000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/mnist/lenet_adadelta"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
delta: 1e-6
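For reference, these settings map onto the standard AdaDelta update rule from Zeiler (2012), with momentum playing the role of the decay rate ρ and delta the ε added inside the square roots (a sketch of the math, not part of the patch):

$$
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1 - \rho)\, g_t^2 \\
\Delta x_t &= -\frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}}\; g_t \\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1 - \rho)\, \Delta x_t^2
\end{aligned}
$$

Because the method derives its effective step size from these running averages, base_lr is left at 1.0; the implementation below still multiplies the final update by the local learning rate.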
New file (MNIST autoencoder solver definition using AdaDelta):

@@ -0,0 +1,19 @@
net: "examples/mnist/mnist_autoencoder.prototxt"
test_state: { stage: 'test-on-train' }
test_iter: 500
test_state: { stage: 'test-on-test' }
test_iter: 100
test_interval: 500
test_compute_loss: true
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
delta: 1e-8
display: 100
max_iter: 65000
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
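The repeated test_state/test_iter fields pair up in order, as in the existing autoencoder solver: 500 test iterations in the 'test-on-train' stage and 100 in the 'test-on-test' stage. Note also that this solver uses a smaller delta (1e-8) than the LeNet solver above (1e-6).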
New file (training script for the autoencoder AdaDelta example):

@@ -0,0 +1,4 @@
#!/bin/bash

./build/tools/caffe train \
    --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
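As with the other MNIST example scripts, this is presumably meant to be run from the Caffe repository root after building, since both the tool path and the solver path are relative to it.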
Changes to the solver implementation:

@@ -934,10 +934,157 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  }
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
  // Add the extra history entries for AdaDelta after those from
  // SGDSolver::PreSolve
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  for (int i = 0; i < net_params.size(); ++i) {
    const vector<int>& shape = net_params[i]->shape();
    this->history_.push_back(
        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
  }
}
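After this pre-solve step, history_ holds two blobs per learnable parameter: the first net_params.size() entries (allocated by SGDSolver::PreSolve) accumulate the squared gradients, and the entries appended here, addressed below via update_history_offset, accumulate the squared updates.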
template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype delta = this->param_.delta();
  Dtype momentum = this->param_.momentum();
  Dtype local_rate = rate * net_params_lr[param_id];
  size_t update_history_offset = net_params.size();
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    // compute square of gradient in update
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());

    // update history of gradients
    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->cpu_data(), momentum,
        this->history_[param_id]->mutable_cpu_data());

    // add delta to history to guard against dividing by zero later
    caffe_set(net_params[param_id]->count(), delta,
        this->temp_[param_id]->mutable_cpu_data());

    caffe_add(net_params[param_id]->count(),
        this->temp_[param_id]->cpu_data(),
        this->history_[update_history_offset + param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());

    caffe_add(net_params[param_id]->count(),
        this->temp_[param_id]->cpu_data(),
        this->history_[param_id]->cpu_data(),
        this->temp_[param_id]->mutable_cpu_data());

    // divide history of updates by history of gradients
    caffe_div(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(),
        this->temp_[param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());

    // jointly compute the RMS of both for update and gradient history
    caffe_powx(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(), Dtype(0.5),
        this->update_[param_id]->mutable_cpu_data());

    // compute the update
    caffe_mul(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(),
        this->update_[param_id]->cpu_data(),
        net_params[param_id]->mutable_cpu_diff());

    // compute square of update
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());

    // update history of updates
    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->cpu_data(), momentum,
        this->history_[update_history_offset + param_id]->mutable_cpu_data());

    // apply learning rate
    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
        net_params[param_id]->cpu_diff(),
        net_params[param_id]->mutable_cpu_diff());
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    // compute square of gradient in update
    caffe_gpu_powx(net_params[param_id]->count(),
        net_params[param_id]->gpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_gpu_data());

    // update history of gradients
    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->gpu_data(), momentum,
        this->history_[param_id]->mutable_gpu_data());

    // add delta to history to guard against dividing by zero later
    caffe_gpu_set(net_params[param_id]->count(), delta,
        this->temp_[param_id]->mutable_gpu_data());

    caffe_gpu_add(net_params[param_id]->count(),
        this->temp_[param_id]->gpu_data(),
        this->history_[update_history_offset + param_id]->gpu_data(),
        this->update_[param_id]->mutable_gpu_data());

    caffe_gpu_add(net_params[param_id]->count(),
        this->temp_[param_id]->gpu_data(),
        this->history_[param_id]->gpu_data(),
        this->temp_[param_id]->mutable_gpu_data());

    // divide history of updates by history of gradients
    caffe_gpu_div(net_params[param_id]->count(),
        this->update_[param_id]->gpu_data(),
        this->temp_[param_id]->gpu_data(),
        this->update_[param_id]->mutable_gpu_data());

    // jointly compute the RMS of both for update and gradient history
    caffe_gpu_powx(net_params[param_id]->count(),
        this->update_[param_id]->gpu_data(), Dtype(0.5),
        this->update_[param_id]->mutable_gpu_data());

    // compute the update and copy to net_diff
    caffe_gpu_mul(net_params[param_id]->count(),
        net_params[param_id]->gpu_diff(),
        this->update_[param_id]->gpu_data(),
        net_params[param_id]->mutable_gpu_diff());

    // compute square of update
    caffe_gpu_powx(net_params[param_id]->count(),
        net_params[param_id]->gpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_gpu_data());

    // update history of updates
    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->gpu_data(), momentum,
        this->history_[update_history_offset + param_id]->mutable_gpu_data());

    // apply learning rate
    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
        net_params[param_id]->gpu_diff(),
        net_params[param_id]->mutable_gpu_diff());
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}
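Read per element, the sequence of BLAS-style calls above amounts to the following scalar computation (a minimal illustrative sketch, not part of the patch; g, hist_g, and hist_u are stand-ins for the parameter diff, history_[param_id], and history_[update_history_offset + param_id]):

#include <cmath>

// One AdaDelta step for a single weight (sketch only).
template <typename Dtype>
Dtype AdaDeltaStep(Dtype g, Dtype& hist_g, Dtype& hist_u,
                   Dtype momentum, Dtype delta, Dtype local_rate) {
  // running average of squared gradients
  hist_g = (1 - momentum) * g * g + momentum * hist_g;
  // RMS ratio of update history to gradient history, with delta for stability
  Dtype update = g * std::sqrt((hist_u + delta) / (hist_g + delta));
  // running average of squared updates (before the learning rate is applied)
  hist_u = (1 - momentum) * update * update + momentum * hist_u;
  // the value written back into the parameter diff
  return local_rate * update;
}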
INSTANTIATE_CLASS(Solver);
INSTANTIATE_CLASS(SGDSolver);
INSTANTIATE_CLASS(NesterovSolver);
INSTANTIATE_CLASS(AdaGradSolver);
INSTANTIATE_CLASS(RMSPropSolver);
INSTANTIATE_CLASS(AdaDeltaSolver);

}  // namespace caffe
Review comment (on the GPU "update history of updates" call):

let's add local_rate multiplication after this line, where you have computed square of update (don't scale update by local_rate before computing square of update).

Reply:

Addressed in matthiasplappert@322a9de
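Presumably the motivation is that scaling before squaring would fold the learning rate into the accumulated E[Δx²] history, coupling the statistics to local_rate; applying local_rate only to the final diff, as the committed code above does, keeps the histories rate-independent.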