From 430db2bb8c1dec14556fef88fe7eb2b8f5846dcd Mon Sep 17 00:00:00 2001
From: Ronghang Hu
Date: Wed, 5 Aug 2015 17:28:49 -0700
Subject: [PATCH] Clean up RMSprop to be compatible with new solver interface

Clean up the RMSProp solver to adjust to the new solver interface, which
uses accumulated gradients and refactored regularization.
---
 examples/mnist/lenet_solver_rmsprop.prototxt  |   6 +-
 include/caffe/solver.hpp                      |  28 ++-
 python/caffe/classifier.py                    |  17 --
 src/caffe/proto/caffe.proto                   |  31 ++-
 src/caffe/solver.cpp                          | 200 ++++----------
 src/caffe/test/test_gradient_based_solver.cpp | 205 +++++++++---------
 6 files changed, 185 insertions(+), 302 deletions(-)

diff --git a/examples/mnist/lenet_solver_rmsprop.prototxt b/examples/mnist/lenet_solver_rmsprop.prototxt
index 2c9724be447..74dadc51069 100644
--- a/examples/mnist/lenet_solver_rmsprop.prototxt
+++ b/examples/mnist/lenet_solver_rmsprop.prototxt
@@ -23,7 +23,5 @@ snapshot: 5000
 snapshot_prefix: "examples/mnist/lenet_rmsprop"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type:RMSPROP
-rms_decay:0.98
-
-
+solver_type: RMSPROP
+rms_decay: 0.98
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index f57e3cec843..4d677e042fe 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -130,17 +130,25 @@ class AdaGradSolver : public SGDSolver<Dtype> {
 
 template <typename Dtype>
-class RMSpropSolver : public SGDSolver<Dtype> {
-public:
-  explicit RMSpropSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { }
-  explicit RMSpropSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { }
+class RMSPropSolver : public SGDSolver<Dtype> {
+ public:
+  explicit RMSPropSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
+  explicit RMSPropSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
 
-protected:
-  virtual void ComputeUpdateValue();
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  void constructor_sanity_check() {
+    CHECK_EQ(0, this->param_.momentum())
+        << "Momentum cannot be used with RMSProp.";
+    CHECK_GE(this->param_.rms_decay(), 0)
+        << "rms_decay should lie between 0 and 1.";
+    CHECK_LT(this->param_.rms_decay(), 1)
+        << "rms_decay should lie between 0 and 1.";
+  }
 
-  DISABLE_COPY_AND_ASSIGN(RMSpropSolver);
+  DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
 };
 
 template <typename Dtype>
@@ -155,7 +163,7 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
   case SolverParameter_SolverType_ADAGRAD:
     return new AdaGradSolver<Dtype>(param);
   case SolverParameter_SolverType_RMSPROP:
-    return new RMSpropSolver<Dtype>(param);
+    return new RMSPropSolver<Dtype>(param);
   default:
     LOG(FATAL) << "Unknown SolverType: " << type;
   }
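The header change above is the heart of the patch: `ComputeUpdateValue()` becomes `ComputeUpdateValue(int param_id, Dtype rate)`, so a solver subclass no longer loops over all parameters or re-applies weight decay itself. A minimal sketch of the resulting control flow follows; the class and method names here (`SolverSketch`, `Normalize`, `Regularize`) are illustrative stand-ins for the refactored SGD solver path, not Caffe's exact code:

```cpp
// Sketch of the refactored update path (illustrative, not Caffe's exact API).
#include <cstdio>

class SolverSketch {
 public:
  virtual ~SolverSketch() {}
  // Formerly each solver looped over all parameters itself; now the shared
  // path iterates and hands each solver one (param_id, rate) pair.
  void ApplyUpdate(float rate, int num_params) {
    for (int param_id = 0; param_id < num_params; ++param_id) {
      Normalize(param_id);                 // scale accumulated diff by 1/iter_size
      Regularize(param_id);                // shared L1/L2 weight decay
      ComputeUpdateValue(param_id, rate);  // solver-specific rule (SGD, RMSProp...)
    }
    // followed by the net update: data -= diff for every parameter blob
  }

 protected:
  virtual void ComputeUpdateValue(int param_id, float rate) = 0;
  void Normalize(int /*param_id*/) {}
  void Regularize(int /*param_id*/) {}
};

class RMSPropSketch : public SolverSketch {
 protected:
  virtual void ComputeUpdateValue(int param_id, float rate) {
    std::printf("RMSProp update for param %d at rate %g\n", param_id, rate);
  }
};

int main() {
  RMSPropSketch solver;
  solver.ApplyUpdate(0.01f, 2);  // e.g. a weight blob and a bias blob
  return 0;
}
```

This is also why the constructor sanity check can reject momentum outright: the RMSProp rule below never reads it, and the shared path handles everything else.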
- """ - caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST) - caffe.set_phase_test() - - if gpu: - caffe.set_mode_gpu() - else: - caffe.set_mode_cpu() ->>>>>>> Implement RMSprop # configure pre-processing in_ = self.inputs[0] diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5e830fc1b73..a9f39899eb5 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -96,11 +96,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -<<<<<<< HEAD -// SolverParameter next available ID: 37 (last added: iter_size) -======= -// SolverParameter next available ID: 37 (last added: rms_decay) ->>>>>>> Implement RMSprop +// SolverParameter next available ID: 38 (last added: rms_decay) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -155,7 +151,23 @@ message SolverParameter { optional int32 max_iter = 7; // the maximum number of iterations // accumulate gradients over `iter_size` x `batch_size` instances optional int32 iter_size = 36 [default = 1]; - optional string lr_policy = 8; // The learning rate decay policy. + + // The learning rate decay policy. The currently implemented learning rate + // policies are as follows: + // - fixed: always return base_lr. + // - step: return base_lr * gamma ^ (floor(iter / step)) + // - exp: return base_lr * gamma ^ iter + // - inv: return base_lr * (1 + gamma * iter) ^ (- power) + // - multistep: similar to step but it allows non uniform steps defined by + // stepvalue + // - poly: the effective learning rate follows a polynomial decay, to be + // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) + // - sigmoid: the effective learning rate follows a sigmod decay + // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // + // where base_lr, max_iter, gamma, step, stepvalue and power are defined + // in the solver parameter protocol buffer, and iter is the current iteration. + optional string lr_policy = 8; optional float gamma = 9; // The parameter to compute the learning rate. optional float power = 10; // The parameter to compute the learning rate. optional float momentum = 11; // The momentum value. @@ -200,9 +212,10 @@ message SolverParameter { optional SolverType solver_type = 30 [default = SGD]; // numerical stability for AdaGrad optional float delta = 31 [default = 1e-8]; - - //RMSprop decay value - optional float rms_decay = 36; + + // RMSProp decay value + // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) + optional float rms_decay = 37; // If true, print information about the state of the net that may help with // debugging learning problems. 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 2a02dd307fe..18b8a0c96e2 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -644,7 +644,6 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   Dtype momentum = this->param_.momentum();
   Dtype local_rate = rate * net_params_lr[param_id];
   switch (Caffe::mode()) {
-<<<<<<< HEAD
   case Caffe::CPU: {
     // save history momentum for stepping back
     caffe_copy(net_params[param_id]->count(),
@@ -665,53 +664,6 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     caffe_copy(net_params[param_id]->count(),
         this->update_[param_id]->cpu_data(),
         net_params[param_id]->mutable_cpu_diff());
-=======
-  case Caffe::CPU:
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      // save history momentum for stepping back
-      caffe_copy(net_params[param_id]->count(),
-          this->history_[param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
-
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_cpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->cpu_data(),
-              this->temp_[param_id]->mutable_cpu_data());
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      // update history
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-          net_params[param_id]->cpu_diff(), momentum,
-          this->history_[param_id]->mutable_cpu_data());
-
-      // compute uppate: step back then over step
-      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-          this->history_[param_id]->cpu_data(), -momentum,
-          this->update_[param_id]->mutable_cpu_data());
-
-      // copy
-      caffe_copy(net_params[param_id]->count(),
-          this->update_[param_id]->cpu_data(),
-          net_params[param_id]->mutable_cpu_diff());
-    }
->>>>>>> Implement RMSprop
     break;
   }
   case Caffe::GPU: {
@@ -824,135 +776,71 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 }
 
 template <typename Dtype>
-void RMSpropSolver<Dtype>::ComputeUpdateValue() {
+void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
 
   // get the learning rate
-  Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
   Dtype rms_decay = this->param_.rms_decay();
+  Dtype local_rate = rate * net_params_lr[param_id];
 
-  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
-  }
-  Dtype weight_decay = this->param_.weight_decay();
-  string regularization_type = this->param_.regularization_type();
   switch (Caffe::mode()) {
   case Caffe::CPU:
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_cpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->cpu_data(),
-              this->temp_[param_id]->mutable_cpu_data());
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      //Compute RMSstep
-      // compute square of gradient in update
-      caffe_powx(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // update history
-      caffe_cpu_axpby(net_params[param_id] -> count(),
-          Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
-          rms_decay, this->history_[param_id]-> mutable_cpu_data());
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
 
-      // prepare update
-      caffe_powx(net_params[param_id]->count(),
-          this->history_[param_id]->cpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_cpu_data());
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
+        rms_decay, this->history_[param_id]->mutable_cpu_data());
 
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
 
-      caffe_add_scalar(net_params[param_id]->count(),
-          delta, this->update_[param_id]->mutable_cpu_data());
+    caffe_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_cpu_data());
 
-      caffe_div(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(),
-          this->update_[param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
+    caffe_div(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
 
-      // scale and copy
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-          this->update_[param_id]->cpu_data(), Dtype(0),
-          net_params[param_id]->mutable_cpu_diff());
-    }
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
     break;
   case Caffe::GPU:
 #ifndef CPU_ONLY
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_gpu_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_gpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->gpu_data(),
-              this->temp_[param_id]->mutable_gpu_data());
-          caffe_gpu_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      //Compute RMSstep
-      // compute square of gradient in update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // update history
-      caffe_gpu_axpby(net_params[param_id] -> count(),
-          Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
-          rms_decay, this->history_[param_id]-> mutable_gpu_data());
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
 
-      // prepare update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          this->history_[param_id]->gpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_gpu_data());
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
+        rms_decay, this->history_[param_id]->mutable_gpu_data());
 
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_add_scalar(net_params[param_id]->count(),
-          delta, this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_div(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(),
-          this->update_[param_id]->gpu_data(),
-          this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_div(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-          this->update_[param_id]->gpu_data(), Dtype(0),
-          net_params[param_id]->mutable_gpu_diff());
-    }
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
 #else
     NO_GPU;
 #endif
@@ -966,6 +854,6 @@ INSTANTIATE_CLASS(Solver);
 INSTANTIATE_CLASS(SGDSolver);
 INSTANTIATE_CLASS(NesterovSolver);
 INSTANTIATE_CLASS(AdaGradSolver);
-INSTANTIATE_CLASS(RMSpropSolver);
+INSTANTIATE_CLASS(RMSPropSolver);
 
 }  // namespace caffe
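Read together, the CPU and GPU branches implement exactly the recurrence promised by the `rms_decay` comment in caffe.proto. Here is a scalar sketch of the same sequence, one line per `caffe_*` call, to make the order of operations explicit (illustrative only; the real code operates element-wise over whole blobs):

```cpp
// Scalar walkthrough of RMSPropSolver::ComputeUpdateValue for one weight.
#include <cmath>
#include <cstdio>

int main() {
  float history = 0.0f;     // this->history_: running mean of squared gradients
  float rms_decay = 0.98f;  // as in lenet_solver_rmsprop.prototxt
  float delta = 1e-8f;      // numerical-stability term from caffe.proto
  float local_rate = 0.01f; // rate * net_params_lr[param_id]
  float grad = 0.5f;        // net_params[param_id]->diff

  float update = grad * grad;                                 // caffe_powx(..., 2)
  history = (1 - rms_decay) * update + rms_decay * history;   // caffe_cpu_axpby
  update = std::sqrt(history);                                // caffe_powx(..., 0.5)
  update += delta;                                            // caffe_add_scalar
  update = grad / update;                                     // caffe_div
  float scaled_diff = local_rate * update;                    // caffe_cpu_axpby(..., 0)

  std::printf("diff written back: %g\n", scaled_diff);
  return 0;
}
```

Note that `delta` is added after the square root, which is exactly what the test's expected value below computes: `grad / (sqrt(rms_decay*history + grad^2*(1-rms_decay)) + delta)`.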
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 22dbfc0d8ab..e1c3c8102a1 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -52,17 +52,13 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
     InitSolver(param);
     delta_ = (solver_type() == SolverParameter_SolverType_ADAGRAD ||
-             solver_type() == SolverParameter_SolverType_RMSPROP) ?
-             param.delta() : 0;
+        solver_type() == SolverParameter_SolverType_RMSPROP) ?
+        param.delta() : 0;
   }
 
   void RunLeastSquaresSolver(const Dtype learning_rate,
-<<<<<<< HEAD
-      const Dtype weight_decay, const Dtype momentum, const int num_iters,
-      const int iter_size = 1) {
-=======
-      const Dtype weight_decay, const Dtype momentum, const Dtype rms_decay, const int num_iters) {
->>>>>>> Implement RMSprop
+      const Dtype weight_decay, const Dtype momentum, const Dtype rms_decay,
+      const int num_iters, const int iter_size = 1) {
     ostringstream proto;
     proto << "max_iter: " << num_iters << " "
@@ -221,7 +217,8 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
         update_value /= std::sqrt(history_value + grad * grad) + delta_;
         break;
       case SolverParameter_SolverType_RMSPROP:
-        update_value /= (std::sqrt(rms_decay*history_value + grad*grad*(1-rms_decay)) + delta_) ;
+        update_value /= std::sqrt(rms_decay*history_value +
+            grad * grad * (1 - rms_decay)) + delta_;
         break;
       default:
        LOG(FATAL) << "Unknown solver type: " << solver_type();
@@ -288,12 +285,13 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   }
 
   void CheckAccumulation(const Dtype kLearningRate, const Dtype kWeightDecay,
-      const Dtype kMomentum, const int kNumIters, const int kIterSize) {
+      const Dtype kMomentum, const Dtype rms_decay, const int kNumIters,
+      const int kIterSize) {
     const double kPrecision = 1e-2;
     const double kMinPrecision = 1e-7;
     // Solve without accumulation and save parameters.
     this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
-        kNumIters);
+        rms_decay, kNumIters);
     // Save parameters for comparison.
     Net<Dtype>& net = *this->solver_->net();
    const vector<shared_ptr<Blob<Dtype> > >& param_blobs =
        net.layer_by_name("innerprod")->blobs();
@@ -305,7 +303,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
     // Solve by equivalent accumulation of gradients over divided batches.
     this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
-        kNumIters, kIterSize);
+        rms_decay, kNumIters, kIterSize);
     Net<Dtype>& net_accum = *this->solver_->net();
     const vector<shared_ptr<Blob<Dtype> > >& accum_params =
         net_accum.layer_by_name("innerprod")->blobs();
@@ -343,18 +341,19 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   // matches the solver's (K+1)th update.
   void TestLeastSquaresUpdate(const Dtype learning_rate = 1.0,
       const Dtype weight_decay = 0.0, const Dtype momentum = 0.0,
-      const Dtype rms_decay=0.0, const int iter_to_check = 0) {
+      const Dtype rms_decay = 0.0, const int iter_to_check = 0) {
     // Initialize the solver and run K (= iter_to_check) solver iterations.
-    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay, iter_to_check );
+    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay,
+        iter_to_check);
 
     // Compute the (K+1)th update using the analytic least squares gradient.
     vector<shared_ptr<Blob<Dtype> > > updated_params;
     ComputeLeastSquaresUpdate(learning_rate, weight_decay, momentum,
-        rms_decay,&updated_params);
+        rms_decay, &updated_params);
 
     // Reinitialize the solver and run K+1 solver iterations.
-    RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
-        rms_decay,iter_to_check + 1);
+    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay,
+        iter_to_check + 1);
 
     // Check that the solver's solution matches ours.
     CheckLeastSquaresUpdate(updated_params);
@@ -400,21 +399,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) {
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.0;
   const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 1;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
-  }
-}
-
-TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithRsmDecay) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
-  const Dtype kRmsDecay = 0.0;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -423,9 +412,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.0;
   const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -434,9 +425,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -445,10 +438,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 template <typename TypeParam>
@@ -488,168 +482,167 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.0;
-  const Dtype kRmsDecay = 0.0;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
+TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.0;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
+}
 
 template <typename TypeParam>
-class RMSpropSolverTest : public GradientBasedSolverTest<TypeParam> {
+class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
 
  protected:
   virtual void InitSolver(const SolverParameter& param) {
-    this->solver_.reset(new RMSpropSolver<Dtype>(param));
+    this->solver_.reset(new NesterovSolver<Dtype>(param));
   }
   virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_RMSPROP;
+    return SolverParameter_SolverType_NESTEROV;
   }
 };
 
-TYPED_TEST_CASE(RMSpropSolverTest, TestDtypesAndDevices);
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdate) {
+TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) {
   this->TestLeastSquaresUpdate();
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateLROneTenth) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.1;
   this->TestLeastSquaresUpdate(kLearningRate);
 }
 
-<<<<<<< HEAD
-TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
-=======
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithWeightDecay) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithRmsDecay) {
->>>>>>> Implement RMSprop
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.0;
-<<<<<<< HEAD
-  const int kNumIters = 4;
-  const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
-=======
-  const Dtype kRmsdecay = 0.95;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
+  }
+}
+
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithMomentum) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
-  const Dtype kRmsdecay = 0.0;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
  }
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithEverything) {
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
-  const Dtype kRmsdecay = 0.95;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay,i);
-  }
->>>>>>> Implement RMSprop
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 template <typename TypeParam>
-class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
+class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
 
  protected:
   virtual void InitSolver(const SolverParameter& param) {
-    this->solver_.reset(new NesterovSolver<Dtype>(param));
+    this->solver_.reset(new RMSPropSolver<Dtype>(param));
   }
   virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_NESTEROV;
+    return SolverParameter_SolverType_RMSPROP;
   }
 };
 
-TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices);
-
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) {
-  this->TestLeastSquaresUpdate();
-}
-
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.1;
-  this->TestLeastSquaresUpdate(kLearningRate);
-}
+TYPED_TEST_CASE(RMSPropSolverTest, TestDtypesAndDevices);
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
-  }
-}
-
-TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithRmsDecay) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
+  const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.9;
-  const Dtype kRmsDecay = 0.0;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.9;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 }  // namespace caffe
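The accumulation tests above all funnel into `CheckAccumulation`, which verifies that running the solver with `iter_size > 1` over divided batches reproduces the single full-batch step. A toy standalone check of that equivalence for a one-parameter least-squares model (not the test code itself; the model and numbers here are illustrative):

```cpp
// Toy check: accumulating gradients over iter_size sub-batches and scaling
// by 1/iter_size gives the same step as one pass over the full batch.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Batch-averaged least-squares gradient: grad_i = (w*x_i - y_i) * x_i.
float Grad(float w, const std::vector<float>& x, const std::vector<float>& y,
           size_t begin, size_t end) {
  float g = 0;
  for (size_t i = begin; i < end; ++i) g += (w * x[i] - y[i]) * x[i];
  return g / static_cast<float>(end - begin);
}

int main() {
  std::vector<float> x = {1, 2, 3, 4}, y = {2, 4, 6, 8};
  float w = 0.5f, lr = 0.1f;
  const int iter_size = 2;

  // One update from the full batch of four samples.
  float full = w - lr * Grad(w, x, y, 0, 4);

  // Equivalent: accumulate over two sub-batches, then scale by 1/iter_size
  // (the role the solver's normalization step plays).
  float acc = Grad(w, x, y, 0, 2) + Grad(w, x, y, 2, 4);
  float split = w - lr * acc / iter_size;

  assert(std::fabs(full - split) < 1e-6f);
  std::printf("full=%g split=%g\n", full, split);
  return 0;
}
```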