AdaDelta Solver (v3) #2782

Merged: 3 commits, Aug 11, 2015
Changes from all commits
24 changes: 24 additions & 0 deletions examples/mnist/lenet_adadelta_solver.prototxt
@@ -0,0 +1,24 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
test_iter: 100
# Carry out testing every 500 training iterations.
test_interval: 500
# The base learning rate, momentum and the weight decay of the network.
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
weight_decay: 0.0005
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 10000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/mnist/lenet_adadelta"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
delta: 1e-6
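
For reference, momentum and delta above map onto the AdaDelta update rule (Zeiler, 2012), which the solver.cpp changes later in this diff implement: momentum is the decay \rho of the two running averages and delta is the conditioning constant \epsilon:

E[g^2]_t = \rho \, E[g^2]_{t-1} + (1 - \rho) \, g_t^2
\Delta x_t = -\sqrt{ (E[\Delta x^2]_{t-1} + \epsilon) / (E[g^2]_t + \epsilon) } \; g_t
E[\Delta x^2]_t = \rho \, E[\Delta x^2]_{t-1} + (1 - \rho) \, \Delta x_t^2

Because the method derives a per-parameter step size from these averages, base_lr is left at 1.0 with lr_policy "fixed"; the local_rate scaling applied at the end of ComputeUpdateValue is then a no-op for parameters with the default lr_mult of 1.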
19 changes: 19 additions & 0 deletions examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
@@ -0,0 +1,19 @@
net: "examples/mnist/mnist_autoencoder.prototxt"
test_state: { stage: 'test-on-train' }
test_iter: 500
test_state: { stage: 'test-on-test' }
test_iter: 100
test_interval: 500
test_compute_loss: true
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
delta: 1e-8
display: 100
max_iter: 65000
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
4 changes: 4 additions & 0 deletions examples/mnist/train_mnist_autoencoder_adadelta.sh
@@ -0,0 +1,4 @@
#!/bin/bash

./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
17 changes: 17 additions & 0 deletions include/caffe/solver.hpp
@@ -158,6 +158,21 @@ class RMSPropSolver : public SGDSolver<Dtype> {
DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
};

template <typename Dtype>
class AdaDeltaSolver : public SGDSolver<Dtype> {
public:
explicit AdaDeltaSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }

protected:
void AdaDeltaPreSolve();
virtual void ComputeUpdateValue(int param_id, Dtype rate);

DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
};

template <typename Dtype>
Solver<Dtype>* GetSolver(const SolverParameter& param) {
SolverParameter_SolverType type = param.solver_type();
@@ -171,6 +186,8 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
return new AdaGradSolver<Dtype>(param);
case SolverParameter_SolverType_RMSPROP:
return new RMSPropSolver<Dtype>(param);
case SolverParameter_SolverType_ADADELTA:
return new AdaDeltaSolver<Dtype>(param);
default:
LOG(FATAL) << "Unknown SolverType: " << type;
}
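As a usage sketch (not part of this diff): the new enum case is reached through the solver factory like the existing types. This assumes caffe's ReadProtoFromTextFileOrDie helper from util/io.hpp; mode selection and error handling are omitted.

#include "caffe/caffe.hpp"

int main() {
  caffe::SolverParameter param;
  // solver prototxt added by this PR; it sets solver_type: ADADELTA
  caffe::ReadProtoFromTextFileOrDie(
      "examples/mnist/lenet_adadelta_solver.prototxt", &param);
  // GetSolver dispatches on param.solver_type() and returns an
  // AdaDeltaSolver<float> for the ADADELTA case added above
  caffe::Solver<float>* solver = caffe::GetSolver<float>(param);
  solver->Solve();
  delete solver;
  return 0;
}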
1 change: 1 addition & 0 deletions src/caffe/proto/caffe.proto
@@ -215,6 +215,7 @@ message SolverParameter {
NESTEROV = 1;
ADAGRAD = 2;
RMSPROP = 3;
ADADELTA = 4;
}
optional SolverType solver_type = 30 [default = SGD];
// numerical stability for AdaGrad
147 changes: 147 additions & 0 deletions src/caffe/solver.cpp
@@ -934,10 +934,157 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
}
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
// Add the extra history entries for AdaDelta after those from
// SGDSolver::PreSolve
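// (history_[0..N-1], created by SGDSolver::PreSolve, will hold E[g^2];
// the N extra blobs pushed here will hold E[dx^2])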
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
for (int i = 0; i < net_params.size(); ++i) {
const vector<int>& shape = net_params[i]->shape();
this->history_.push_back(
shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
}
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype delta = this->param_.delta();
Dtype momentum = this->param_.momentum();
Dtype local_rate = rate * net_params_lr[param_id];
size_t update_history_offset = net_params.size();
switch (Caffe::mode()) {
case Caffe::CPU: {
// compute square of gradient in update
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());

// update history of gradients
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[param_id]->mutable_cpu_data());

// add delta to history to guard against dividing by zero later
caffe_set(net_params[param_id]->count(), delta,
this->temp_[param_id]->mutable_cpu_data());

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[update_history_offset + param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());
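// update_ now holds E[dx^2] + delta; the next add forms E[g^2] + delta in temp_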

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[param_id]->cpu_data(),
this->temp_[param_id]->mutable_cpu_data());

// divide history of updates by history of gradients
caffe_div(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(),
this->temp_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());

// jointly compute the RMS of both the update and gradient histories
caffe_powx(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_cpu_data());
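// update_ now holds RMS[dx]_{t-1} / RMS[g]_t, the per-weight effective step size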

// compute the update
caffe_mul(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(),
this->update_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());

// compute square of update
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());

// update history of updates
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[update_history_offset + param_id]->mutable_cpu_data());
Member:
let's add the local_rate multiplication after this line, where you have computed the square of the update (don't scale the update by local_rate before computing the square of the update).

caffe_cpu_scale(net_params[param_id]->count(), local_rate,
    net_params[param_id]->cpu_diff(),
    net_params[param_id]->mutable_cpu_diff());

// apply learning rate
caffe_cpu_scale(net_params[param_id]->count(), local_rate,
net_params[param_id]->cpu_diff(),
net_params[param_id]->mutable_cpu_diff());
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
// compute square of gradient in update
caffe_gpu_powx(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(), Dtype(2),
this->update_[param_id]->mutable_gpu_data());

// update history of gradients
caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->gpu_data(), momentum,
this->history_[param_id]->mutable_gpu_data());

// add delta to history to guard against dividing by zero later
caffe_gpu_set(net_params[param_id]->count(), delta,
this->temp_[param_id]->mutable_gpu_data());

caffe_gpu_add(net_params[param_id]->count(),
this->temp_[param_id]->gpu_data(),
this->history_[update_history_offset + param_id]->gpu_data(),
this->update_[param_id]->mutable_gpu_data());

caffe_gpu_add(net_params[param_id]->count(),
this->temp_[param_id]->gpu_data(),
this->history_[param_id]->gpu_data(),
this->temp_[param_id]->mutable_gpu_data());

// divide history of updates by history of gradients
caffe_gpu_div(net_params[param_id]->count(),
this->update_[param_id]->gpu_data(),
this->temp_[param_id]->gpu_data(),
this->update_[param_id]->mutable_gpu_data());

// jointly compute the RMS of both the update and gradient histories
caffe_gpu_powx(net_params[param_id]->count(),
this->update_[param_id]->gpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_gpu_data());

// compute the update and copy to net_diff
caffe_gpu_mul(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(),
this->update_[param_id]->gpu_data(),
net_params[param_id]->mutable_gpu_diff());

// compute square of update
caffe_gpu_powx(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(), Dtype(2),
this->update_[param_id]->mutable_gpu_data());

// update history of updates
caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->gpu_data(), momentum,
this->history_[update_history_offset + param_id]->mutable_gpu_data());
Member:
let's add the local_rate multiplication after this line, where you have computed the square of the update (don't scale the update by local_rate before computing the square of the update).

caffe_gpu_scale(net_params[param_id]->count(), local_rate,
    net_params[param_id]->gpu_diff(),
    net_params[param_id]->mutable_gpu_diff());

// apply learning rate
caffe_gpu_scale(net_params[param_id]->count(), local_rate,
net_params[param_id]->gpu_diff(),
net_params[param_id]->mutable_gpu_diff());
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(Solver);
INSTANTIATE_CLASS(SGDSolver);
INSTANTIATE_CLASS(NesterovSolver);
INSTANTIATE_CLASS(AdaGradSolver);
INSTANTIATE_CLASS(RMSPropSolver);
INSTANTIATE_CLASS(AdaDeltaSolver);

} // namespace caffe
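
As a sanity check on the update algebra (not part of the PR), here is a dependency-free scalar version of the same rule, using the hyperparameters from the LeNet prototxt above:

#include <cmath>
#include <cstdio>

// Scalar AdaDelta minimizing f(x) = x^2, mirroring the blob-wise code above.
int main() {
  double x = 1.0;
  const double rho = 0.95;  // "momentum" in the prototxt
  const double eps = 1e-6;  // "delta" in the prototxt
  double eg2 = 0.0;         // E[g^2]  (history_[param_id])
  double edx2 = 0.0;        // E[dx^2] (history_[update_history_offset + param_id])
  for (int t = 0; t < 200; ++t) {
    const double g = 2.0 * x;                    // gradient of x^2
    eg2 = rho * eg2 + (1.0 - rho) * g * g;       // update gradient history
    const double dx = -std::sqrt((edx2 + eps) / (eg2 + eps)) * g;
    edx2 = rho * edx2 + (1.0 - rho) * dx * dx;   // update update-history
    x += dx;  // base_lr is 1.0, so no extra local_rate scaling
  }
  std::printf("x after 200 steps: %g\n", x);     // should have moved toward 0
  return 0;
}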