
Commit

Merge pull request #2782 from matthiasplappert/adadelta
AdaDelta Solver (v3)
ronghanghu committed Aug 11, 2015
2 parents cfd31fe + f2e523e commit ebc3e3b
Showing 7 changed files with 365 additions and 2 deletions.
24 changes: 24 additions & 0 deletions examples/mnist/lenet_adadelta_solver.prototxt
@@ -0,0 +1,24 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
test_iter: 100
# Carry out testing every 500 training iterations.
test_interval: 500
# The base learning rate, momentum and the weight decay of the network.
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
weight_decay: 0.0005
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 10000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/mnist/lenet_adadelta"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
delta: 1e-6
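
A note on the settings above, since AdaDelta repurposes two existing fields: with solver_type: ADADELTA, momentum acts as the decay rate rho of the running averages and delta is the epsilon stabilizer in the update rule (summarized after ComputeUpdateValue below), which is why base_lr can be left at 1.0. The solver is launched like any other, e.g. ./build/tools/caffe train --solver=examples/mnist/lenet_adadelta_solver.prototxt, mirroring the autoencoder script further down.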
19 changes: 19 additions & 0 deletions examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
@@ -0,0 +1,19 @@
net: "examples/mnist/mnist_autoencoder.prototxt"
test_state: { stage: 'test-on-train' }
test_iter: 500
test_state: { stage: 'test-on-test' }
test_iter: 100
test_interval: 500
test_compute_loss: true
base_lr: 1.0
lr_policy: "fixed"
momentum: 0.95
delta: 1e-8
display: 100
max_iter: 65000
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
4 changes: 4 additions & 0 deletions examples/mnist/train_mnist_autoencoder_adadelta.sh
@@ -0,0 +1,4 @@
#!/bin/bash

./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
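
As with the other MNIST example scripts, the relative paths assume the script is invoked from the Caffe repository root.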
17 changes: 17 additions & 0 deletions include/caffe/solver.hpp
@@ -158,6 +158,21 @@ class RMSPropSolver : public SGDSolver<Dtype> {
DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
};

template <typename Dtype>
class AdaDeltaSolver : public SGDSolver<Dtype> {
public:
explicit AdaDeltaSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }

protected:
void AdaDeltaPreSolve();
virtual void ComputeUpdateValue(int param_id, Dtype rate);

DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
};

template <typename Dtype>
Solver<Dtype>* GetSolver(const SolverParameter& param) {
SolverParameter_SolverType type = param.solver_type();
@@ -171,6 +186,8 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
return new AdaGradSolver<Dtype>(param);
case SolverParameter_SolverType_RMSPROP:
return new RMSPropSolver<Dtype>(param);
case SolverParameter_SolverType_ADADELTA:
return new AdaDeltaSolver<Dtype>(param);
default:
LOG(FATAL) << "Unknown SolverType: " << type;
}
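To show how the new ADADELTA case is reached, here is a minimal sketch of driving this factory, modeled on the caffe train tool; it is not part of this commit and assumes the Caffe API of the period (ReadProtoFromTextFileOrDie, GetSolver<float>):

#include <boost/shared_ptr.hpp>
#include "caffe/caffe.hpp"  // SolverParameter, Solver, GetSolver, io utilities

int main() {
  caffe::SolverParameter solver_param;
  // Parse a solver definition; with solver_type: ADADELTA the factory
  // above returns an AdaDeltaSolver.
  caffe::ReadProtoFromTextFileOrDie(
      "examples/mnist/lenet_adadelta_solver.prototxt", &solver_param);

  // The real caffe tool also applies solver_mode from the prototxt;
  // mirror that here so solver_mode: GPU takes effect.
  if (solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
    caffe::Caffe::set_mode(caffe::Caffe::GPU);
  }

  boost::shared_ptr<caffe::Solver<float> > solver(
      caffe::GetSolver<float>(solver_param));
  solver->Solve();  // run the optimization loop
  return 0;
}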
1 change: 1 addition & 0 deletions src/caffe/proto/caffe.proto
@@ -215,6 +215,7 @@ message SolverParameter {
NESTEROV = 1;
ADAGRAD = 2;
RMSPROP = 3;
ADADELTA = 4;
}
optional SolverType solver_type = 30 [default = SGD];
// numerical stability for AdaGrad
147 changes: 147 additions & 0 deletions src/caffe/solver.cpp
@@ -934,10 +934,157 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
}
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
// Add the extra history entries for AdaDelta after those from
// SGDSolver::PreSolve
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
for (int i = 0; i < net_params.size(); ++i) {
const vector<int>& shape = net_params[i]->shape();
this->history_.push_back(
shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
}
}
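
After this pre-solve step, history_ holds twice as many blobs as there are learnable parameters: the first half (created by SGDSolver<Dtype>::PreSolve) accumulates the running average of squared gradients, while the second half, addressed as history_[update_history_offset + param_id] in ComputeUpdateValue below, accumulates the running average of squared updates.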

template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype delta = this->param_.delta();
Dtype momentum = this->param_.momentum();
Dtype local_rate = rate * net_params_lr[param_id];
size_t update_history_offset = net_params.size();
switch (Caffe::mode()) {
case Caffe::CPU: {
// compute square of gradient in update
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());

// update history of gradients
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[param_id]->mutable_cpu_data());

// add delta to history to guard against dividing by zero later
caffe_set(net_params[param_id]->count(), delta,
this->temp_[param_id]->mutable_cpu_data());

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[update_history_offset + param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[param_id]->cpu_data(),
this->temp_[param_id]->mutable_cpu_data());

// divide history of updates by history of gradients
caffe_div(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(),
this->temp_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());

// jointly compute the RMS of both the update and gradient histories
caffe_powx(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_cpu_data());

// compute the update
caffe_mul(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(),
this->update_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());

// compute square of update
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());

// update history of updates
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[update_history_offset + param_id]->mutable_cpu_data());

// apply learning rate
caffe_cpu_scale(net_params[param_id]->count(), local_rate,
net_params[param_id]->cpu_diff(),
net_params[param_id]->mutable_cpu_diff());
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
// compute square of gradient in update
caffe_gpu_powx(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(), Dtype(2),
this->update_[param_id]->mutable_gpu_data());

// update history of gradients
caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->gpu_data(), momentum,
this->history_[param_id]->mutable_gpu_data());

// add delta to history to guard against dividing by zero later
caffe_gpu_set(net_params[param_id]->count(), delta,
this->temp_[param_id]->mutable_gpu_data());

caffe_gpu_add(net_params[param_id]->count(),
this->temp_[param_id]->gpu_data(),
this->history_[update_history_offset + param_id]->gpu_data(),
this->update_[param_id]->mutable_gpu_data());

caffe_gpu_add(net_params[param_id]->count(),
this->temp_[param_id]->gpu_data(),
this->history_[param_id]->gpu_data(),
this->temp_[param_id]->mutable_gpu_data());

// divide history of updates by history of gradients
caffe_gpu_div(net_params[param_id]->count(),
this->update_[param_id]->gpu_data(),
this->temp_[param_id]->gpu_data(),
this->update_[param_id]->mutable_gpu_data());

// jointly compute the RMS of both the update and gradient histories
caffe_gpu_powx(net_params[param_id]->count(),
this->update_[param_id]->gpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_gpu_data());

// compute the update and copy to net_diff
caffe_gpu_mul(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(),
this->update_[param_id]->gpu_data(),
net_params[param_id]->mutable_gpu_diff());

// compute square of update
caffe_gpu_powx(net_params[param_id]->count(),
net_params[param_id]->gpu_diff(), Dtype(2),
this->update_[param_id]->mutable_gpu_data());

// update history of updates
caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->gpu_data(), momentum,
this->history_[update_history_offset + param_id]->mutable_gpu_data());

// apply learning rate
caffe_gpu_scale(net_params[param_id]->count(), local_rate,
net_params[param_id]->gpu_diff(),
net_params[param_id]->mutable_gpu_diff());
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}
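
For reference, the chain of calls above implements the update rule from Zeiler's ADADELTA paper, with momentum playing the role of the decay rate rho and delta the stabilizer epsilon; a summary of one step per parameter, in LaTeX:

% running average of squared gradients (history_[param_id])
E[g^2]_t = \rho \, E[g^2]_{t-1} + (1 - \rho) \, g_t^2
% step: ratio of the RMS of past updates to the RMS of gradients
\Delta x_t = \frac{\sqrt{E[\Delta x^2]_{t-1} + \varepsilon}}{\sqrt{E[g^2]_t + \varepsilon}} \, g_t
% running average of squared updates (history_[update_history_offset + param_id])
E[\Delta x^2]_t = \rho \, E[\Delta x^2]_{t-1} + (1 - \rho) \, \Delta x_t^2

Caffe stores the positive quantity \Delta x_t in the parameter diff and subtracts it during the blob update; it is scaled by local_rate at the end, so with base_lr set to 1.0 (as in both solver prototxts above) the pure AdaDelta step is applied.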

INSTANTIATE_CLASS(Solver);
INSTANTIATE_CLASS(SGDSolver);
INSTANTIATE_CLASS(NesterovSolver);
INSTANTIATE_CLASS(AdaGradSolver);
INSTANTIATE_CLASS(RMSPropSolver);
INSTANTIATE_CLASS(AdaDeltaSolver);

} // namespace caffe
(Diff for the seventh changed file did not load and is not shown.)
