From 430db2bb8c1dec14556fef88fe7eb2b8f5846dcd Mon Sep 17 00:00:00 2001
From: Ronghang Hu
Date: Wed, 5 Aug 2015 17:28:49 -0700
Subject: [PATCH] Clean up RMSprop to be compatible with new solver interface

Clean up the RMSProp solver to adjust to the new solver interface, which
uses accumulated gradients and refactored regularization.
---
 examples/mnist/lenet_solver_rmsprop.prototxt  |   6 +-
 include/caffe/solver.hpp                      |  28 ++-
 python/caffe/classifier.py                    |  17 --
 src/caffe/proto/caffe.proto                   |  31 ++-
 src/caffe/solver.cpp                          | 200 ++++----------
 src/caffe/test/test_gradient_based_solver.cpp | 205 +++++++++---------
 6 files changed, 185 insertions(+), 302 deletions(-)

diff --git a/examples/mnist/lenet_solver_rmsprop.prototxt b/examples/mnist/lenet_solver_rmsprop.prototxt
index 2c9724be447..74dadc51069 100644
--- a/examples/mnist/lenet_solver_rmsprop.prototxt
+++ b/examples/mnist/lenet_solver_rmsprop.prototxt
@@ -23,7 +23,5 @@ snapshot: 5000
 snapshot_prefix: "examples/mnist/lenet_rmsprop"
 # solver mode: CPU or GPU
 solver_mode: GPU
-solver_type:RMSPROP
-rms_decay:0.98
-
-
+solver_type: RMSPROP
+rms_decay: 0.98
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index f57e3cec843..4d677e042fe 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -130,17 +130,25 @@ class AdaGradSolver : public SGDSolver<Dtype> {
 
 template <typename Dtype>
-class RMSpropSolver : public SGDSolver<Dtype> {
-public:
-  explicit RMSpropSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { }
-  explicit RMSpropSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { }
+class RMSPropSolver : public SGDSolver<Dtype> {
+ public:
+  explicit RMSPropSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
+  explicit RMSPropSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
 
-protected:
-  virtual void ComputeUpdateValue();
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  void constructor_sanity_check() {
+    CHECK_EQ(0, this->param_.momentum())
+        << "Momentum cannot be used with RMSProp.";
+    CHECK_GE(this->param_.rms_decay(), 0)
+        << "rms_decay should lie between 0 and 1.";
+    CHECK_LT(this->param_.rms_decay(), 1)
+        << "rms_decay should lie between 0 and 1.";
+  }
 
-  DISABLE_COPY_AND_ASSIGN(RMSpropSolver);
+  DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
 };
 
 template <typename Dtype>
@@ -155,7 +163,7 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
   case SolverParameter_SolverType_ADAGRAD:
     return new AdaGradSolver<Dtype>(param);
   case SolverParameter_SolverType_RMSPROP:
-    return new RMSpropSolver<Dtype>(param);
+    return new RMSPropSolver<Dtype>(param);
   default:
     LOG(FATAL) << "Unknown SolverType: " << type;
   }
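The header change above is the heart of the patch: `ComputeUpdateValue()` becomes `ComputeUpdateValue(int param_id, Dtype rate)`, so a solver subclass no longer loops over all parameters or re-applies weight decay itself. A minimal sketch of the resulting control flow follows; the class and method names here (`SolverSketch`, `Normalize`, `Regularize`) are illustrative stand-ins for the refactored SGD solver path, not Caffe's exact code:

```cpp
// Sketch of the refactored update path (illustrative, not Caffe's exact API).
#include <cstdio>

class SolverSketch {
 public:
  virtual ~SolverSketch() {}
  // Formerly each solver looped over all parameters itself; now the shared
  // path iterates and hands each solver one (param_id, rate) pair.
  void ApplyUpdate(float rate, int num_params) {
    for (int param_id = 0; param_id < num_params; ++param_id) {
      Normalize(param_id);                 // scale accumulated diff by 1/iter_size
      Regularize(param_id);                // shared L1/L2 weight decay
      ComputeUpdateValue(param_id, rate);  // solver-specific rule (SGD, RMSProp...)
    }
    // followed by the net update: data -= diff for every parameter blob
  }

 protected:
  virtual void ComputeUpdateValue(int param_id, float rate) = 0;
  void Normalize(int /*param_id*/) {}
  void Regularize(int /*param_id*/) {}
};

class RMSPropSketch : public SolverSketch {
 protected:
  virtual void ComputeUpdateValue(int param_id, float rate) {
    std::printf("RMSProp update for param %d at rate %g\n", param_id, rate);
  }
};

int main() {
  RMSPropSketch solver;
  solver.ApplyUpdate(0.01f, 2);  // e.g. a weight blob and a bias blob
  return 0;
}
```

This is also why the constructor sanity check can reject momentum outright: the RMSProp rule below never reads it, and the shared path handles everything else.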
- """ - caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST) - caffe.set_phase_test() - - if gpu: - caffe.set_mode_gpu() - else: - caffe.set_mode_cpu() ->>>>>>> Implement RMSprop # configure pre-processing in_ = self.inputs[0] diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5e830fc1b73..a9f39899eb5 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -96,11 +96,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -<<<<<<< HEAD -// SolverParameter next available ID: 37 (last added: iter_size) -======= -// SolverParameter next available ID: 37 (last added: rms_decay) ->>>>>>> Implement RMSprop +// SolverParameter next available ID: 38 (last added: rms_decay) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -155,7 +151,23 @@ message SolverParameter { optional int32 max_iter = 7; // the maximum number of iterations // accumulate gradients over `iter_size` x `batch_size` instances optional int32 iter_size = 36 [default = 1]; - optional string lr_policy = 8; // The learning rate decay policy. + + // The learning rate decay policy. The currently implemented learning rate + // policies are as follows: + // - fixed: always return base_lr. + // - step: return base_lr * gamma ^ (floor(iter / step)) + // - exp: return base_lr * gamma ^ iter + // - inv: return base_lr * (1 + gamma * iter) ^ (- power) + // - multistep: similar to step but it allows non uniform steps defined by + // stepvalue + // - poly: the effective learning rate follows a polynomial decay, to be + // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) + // - sigmoid: the effective learning rate follows a sigmod decay + // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // + // where base_lr, max_iter, gamma, step, stepvalue and power are defined + // in the solver parameter protocol buffer, and iter is the current iteration. + optional string lr_policy = 8; optional float gamma = 9; // The parameter to compute the learning rate. optional float power = 10; // The parameter to compute the learning rate. optional float momentum = 11; // The momentum value. @@ -200,9 +212,10 @@ message SolverParameter { optional SolverType solver_type = 30 [default = SGD]; // numerical stability for AdaGrad optional float delta = 31 [default = 1e-8]; - - //RMSprop decay value - optional float rms_decay = 36; + + // RMSProp decay value + // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) + optional float rms_decay = 37; // If true, print information about the state of the net that may help with // debugging learning problems. 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 2a02dd307fe..18b8a0c96e2 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -644,7 +644,6 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   Dtype momentum = this->param_.momentum();
   Dtype local_rate = rate * net_params_lr[param_id];
   switch (Caffe::mode()) {
-<<<<<<< HEAD
   case Caffe::CPU: {
     // save history momentum for stepping back
     caffe_copy(net_params[param_id]->count(),
@@ -665,53 +664,6 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     caffe_copy(net_params[param_id]->count(),
         this->update_[param_id]->cpu_data(),
         net_params[param_id]->mutable_cpu_diff());
-=======
-  case Caffe::CPU:
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      // save history momentum for stepping back
-      caffe_copy(net_params[param_id]->count(),
-          this->history_[param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
-
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_cpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->cpu_data(),
-              this->temp_[param_id]->mutable_cpu_data());
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      // update history
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-          net_params[param_id]->cpu_diff(), momentum,
-          this->history_[param_id]->mutable_cpu_data());
-
-      // compute uppate: step back then over step
-      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-          this->history_[param_id]->cpu_data(), -momentum,
-          this->update_[param_id]->mutable_cpu_data());
-
-      // copy
-      caffe_copy(net_params[param_id]->count(),
-          this->update_[param_id]->cpu_data(),
-          net_params[param_id]->mutable_cpu_diff());
-    }
->>>>>>> Implement RMSprop
     break;
   }
   case Caffe::GPU: {
@@ -824,135 +776,71 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 }
 
 template <typename Dtype>
-void RMSpropSolver<Dtype>::ComputeUpdateValue() {
+void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
 
   // get the learning rate
-  Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
   Dtype rms_decay = this->param_.rms_decay();
+  Dtype local_rate = rate * net_params_lr[param_id];
 
-  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
-  }
-  Dtype weight_decay = this->param_.weight_decay();
-  string regularization_type = this->param_.regularization_type();
   switch (Caffe::mode()) {
   case Caffe::CPU:
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_cpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->cpu_data(),
-              this->temp_[param_id]->mutable_cpu_data());
-          caffe_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      //Compute RMSstep
-      // compute square of gradient in update
-      caffe_powx(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // update history
-      caffe_cpu_axpby(net_params[param_id] -> count(),
-          Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
-          rms_decay, this->history_[param_id]-> mutable_cpu_data());
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
 
-      // prepare update
-      caffe_powx(net_params[param_id]->count(),
-          this->history_[param_id]->cpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_cpu_data());
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
+        rms_decay, this->history_[param_id]->mutable_cpu_data());
 
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
 
-      caffe_add_scalar(net_params[param_id]->count(),
-          delta, this->update_[param_id]->mutable_cpu_data());
+    caffe_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_cpu_data());
 
-      caffe_div(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(),
-          this->update_[param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
+    caffe_div(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
 
-      // scale and copy
-      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-          this->update_[param_id]->cpu_data(), Dtype(0),
-          net_params[param_id]->mutable_cpu_diff());
-    }
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
     break;
   case Caffe::GPU:
 #ifndef CPU_ONLY
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-
-      Dtype local_rate = rate * net_params_lr[param_id];
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_gpu_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_gpu_sign<Dtype>(net_params[param_id]->count(),
-              net_params[param_id]->gpu_data(),
-              this->temp_[param_id]->mutable_gpu_data());
-          caffe_gpu_axpy<Dtype>(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
-
-      //Compute RMSstep
-      // compute square of gradient in update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // update history
-      caffe_gpu_axpby(net_params[param_id] -> count(),
-          Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
-          rms_decay, this->history_[param_id]-> mutable_gpu_data());
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
 
-      // prepare update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          this->history_[param_id]->gpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_gpu_data());
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
+        rms_decay, this->history_[param_id]->mutable_gpu_data());
 
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_add_scalar(net_params[param_id]->count(),
-          delta, this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_div(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(),
-          this->update_[param_id]->gpu_data(),
-          this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_div(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
 
-      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-          this->update_[param_id]->gpu_data(), Dtype(0),
-          net_params[param_id]->mutable_gpu_diff());
-    }
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
 #else
     NO_GPU;
 #endif
@@ -966,6 +854,6 @@ INSTANTIATE_CLASS(Solver);
 INSTANTIATE_CLASS(SGDSolver);
 INSTANTIATE_CLASS(NesterovSolver);
 INSTANTIATE_CLASS(AdaGradSolver);
-INSTANTIATE_CLASS(RMSpropSolver);
+INSTANTIATE_CLASS(RMSPropSolver);
 
 }  // namespace caffe
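Read together, the CPU and GPU branches implement exactly the recurrence promised by the `rms_decay` comment in caffe.proto. Here is a scalar sketch of the same sequence, one line per `caffe_*` call, to make the order of operations explicit (illustrative only; the real code operates element-wise over whole blobs):

```cpp
// Scalar walkthrough of RMSPropSolver::ComputeUpdateValue for one weight.
#include <cmath>
#include <cstdio>

int main() {
  float history = 0.0f;     // this->history_: running mean of squared gradients
  float rms_decay = 0.98f;  // as in lenet_solver_rmsprop.prototxt
  float delta = 1e-8f;      // numerical-stability term from caffe.proto
  float local_rate = 0.01f; // rate * net_params_lr[param_id]
  float grad = 0.5f;        // net_params[param_id]->diff

  float update = grad * grad;                                 // caffe_powx(..., 2)
  history = (1 - rms_decay) * update + rms_decay * history;   // caffe_cpu_axpby
  update = std::sqrt(history);                                // caffe_powx(..., 0.5)
  update += delta;                                            // caffe_add_scalar
  update = grad / update;                                     // caffe_div
  float scaled_diff = local_rate * update;                    // caffe_cpu_axpby(..., 0)

  std::printf("diff written back: %g\n", scaled_diff);
  return 0;
}
```

Note that `delta` is added after the square root, which is exactly what the test's expected value below computes: `grad / (sqrt(rms_decay*history + grad^2*(1-rms_decay)) + delta)`.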
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 22dbfc0d8ab..e1c3c8102a1 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -52,17 +52,13 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
     InitSolver(param);
     delta_ = (solver_type() == SolverParameter_SolverType_ADAGRAD ||
-             solver_type() == SolverParameter_SolverType_RMSPROP) ?
-             param.delta() : 0;
+        solver_type() == SolverParameter_SolverType_RMSPROP) ?
+        param.delta() : 0;
   }
 
   void RunLeastSquaresSolver(const Dtype learning_rate,
-<<<<<<< HEAD
-      const Dtype weight_decay, const Dtype momentum, const int num_iters,
-      const int iter_size = 1) {
-=======
-      const Dtype weight_decay, const Dtype momentum, const Dtype rms_decay, const int num_iters) {
->>>>>>> Implement RMSprop
+      const Dtype weight_decay, const Dtype momentum, const Dtype rms_decay,
+      const int num_iters, const int iter_size = 1) {
     ostringstream proto;
     proto << "max_iter: " << num_iters << " "
@@ -221,7 +217,8 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
         update_value /= std::sqrt(history_value + grad * grad) + delta_;
         break;
       case SolverParameter_SolverType_RMSPROP:
-        update_value /= (std::sqrt(rms_decay*history_value + grad*grad*(1-rms_decay)) + delta_) ;
+        update_value /= std::sqrt(rms_decay*history_value +
+            grad * grad * (1 - rms_decay)) + delta_;
         break;
       default:
        LOG(FATAL) << "Unknown solver type: " << solver_type();
@@ -288,12 +285,13 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   }
 
   void CheckAccumulation(const Dtype kLearningRate, const Dtype kWeightDecay,
-      const Dtype kMomentum, const int kNumIters, const int kIterSize) {
+      const Dtype kMomentum, const Dtype rms_decay, const int kNumIters,
+      const int kIterSize) {
     const double kPrecision = 1e-2;
     const double kMinPrecision = 1e-7;
     // Solve without accumulation and save parameters.
     this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
-        kNumIters);
+        rms_decay, kNumIters);
     // Save parameters for comparison.
     Net<Dtype>& net = *this->solver_->net();
    const vector<shared_ptr<Blob<Dtype> > >& param_blobs =
        net.layer_by_name("innerprod")->blobs();
@@ -305,7 +303,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
     // Solve by equivalent accumulation of gradients over divided batches.
     this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum,
-        kNumIters, kIterSize);
+        rms_decay, kNumIters, kIterSize);
     Net<Dtype>& net_accum = *this->solver_->net();
     const vector<shared_ptr<Blob<Dtype> > >& accum_params =
         net_accum.layer_by_name("innerprod")->blobs();
@@ -343,18 +341,19 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   // matches the solver's (K+1)th update.
   void TestLeastSquaresUpdate(const Dtype learning_rate = 1.0,
       const Dtype weight_decay = 0.0, const Dtype momentum = 0.0,
-      const Dtype rms_decay=0.0, const int iter_to_check = 0) {
+      const Dtype rms_decay = 0.0, const int iter_to_check = 0) {
     // Initialize the solver and run K (= iter_to_check) solver iterations.
-    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay, iter_to_check );
+    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay,
+        iter_to_check);
 
     // Compute the (K+1)th update using the analytic least squares gradient.
     vector<shared_ptr<Blob<Dtype> > > updated_params;
     ComputeLeastSquaresUpdate(learning_rate, weight_decay, momentum,
-        rms_decay,&updated_params);
+        rms_decay, &updated_params);
 
     // Reinitialize the solver and run K+1 solver iterations.
-    RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
-        rms_decay,iter_to_check + 1);
+    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, rms_decay,
+        iter_to_check + 1);
 
     // Check that the solver's solution matches ours.
     CheckLeastSquaresUpdate(updated_params);
@@ -400,21 +399,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) {
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.0;
   const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 1;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
-  }
-}
-
-TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithRsmDecay) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
-  const Dtype kRmsDecay = 0.0;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -423,9 +412,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.0;
   const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -434,9 +425,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
@@ -445,10 +438,11 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 template <typename TypeParam>
@@ -488,168 +482,167 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) {
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.0;
-  const Dtype kRmsDecay = 0.0;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
+TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.0;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
+}
 
 template <typename TypeParam>
-class RMSpropSolverTest : public GradientBasedSolverTest<TypeParam> {
+class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
 
  protected:
   virtual void InitSolver(const SolverParameter& param) {
-    this->solver_.reset(new RMSpropSolver<Dtype>(param));
+    this->solver_.reset(new NesterovSolver<Dtype>(param));
   }
   virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_RMSPROP;
+    return SolverParameter_SolverType_NESTEROV;
   }
 };
 
-TYPED_TEST_CASE(RMSpropSolverTest, TestDtypesAndDevices);
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdate) {
+TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) {
   this->TestLeastSquaresUpdate();
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateLROneTenth) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.1;
   this->TestLeastSquaresUpdate(kLearningRate);
 }
 
-<<<<<<< HEAD
-TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
-=======
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithWeightDecay) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithRmsDecay) {
->>>>>>> Implement RMSprop
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.0;
-<<<<<<< HEAD
-  const int kNumIters = 4;
-  const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
-=======
-  const Dtype kRmsdecay = 0.95;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
+  }
+}
+
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithMomentum) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
-  const Dtype kRmsdecay = 0.0;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
  }
 }
 
-TYPED_TEST(RMSpropSolverTest, TestRMSpropLeastSquaresUpdateWithEverything) {
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
   const Dtype kMomentum = 0.9;
-  const Dtype kRmsdecay = 0.95;
+  const Dtype kRMSDecay = 0.0;
   const int kNumIters = 4;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsdecay,i);
-  }
->>>>>>> Implement RMSprop
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 template <typename TypeParam>
-class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
+class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
 
  protected:
   virtual void InitSolver(const SolverParameter& param) {
-    this->solver_.reset(new NesterovSolver<Dtype>(param));
+    this->solver_.reset(new RMSPropSolver<Dtype>(param));
   }
   virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_NESTEROV;
+    return SolverParameter_SolverType_RMSPROP;
   }
 };
 
-TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices);
-
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) {
-  this->TestLeastSquaresUpdate();
-}
-
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.1;
-  this->TestLeastSquaresUpdate(kLearningRate);
-}
+TYPED_TEST_CASE(RMSPropSolverTest, TestDtypesAndDevices);
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 1.0;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
-  }
-}
-
-TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithRmsDecay) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
+  const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, 0.0, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.9;
-  const Dtype kRmsDecay = 0.0;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, kRmsDecay, i);
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum,
+        kRMSDecay, i);
   }
 }
 
-TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.9;
+  const Dtype kMomentum = 0.0;
+  const Dtype kRMSDecay = 0.95;
   const int kNumIters = 4;
   const int kIterSize = 2;
-  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
-      kIterSize);
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kRMSDecay,
+      kNumIters, kIterSize);
 }
 
 }  // namespace caffe
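The accumulation tests above all funnel into `CheckAccumulation`, which verifies that running the solver with `iter_size > 1` over divided batches reproduces the single full-batch step. A toy standalone check of that equivalence for a one-parameter least-squares model (not the test code itself; the model and numbers here are illustrative):

```cpp
// Toy check: accumulating gradients over iter_size sub-batches and scaling
// by 1/iter_size gives the same step as one pass over the full batch.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Batch-averaged least-squares gradient: grad_i = (w*x_i - y_i) * x_i.
float Grad(float w, const std::vector<float>& x, const std::vector<float>& y,
           size_t begin, size_t end) {
  float g = 0;
  for (size_t i = begin; i < end; ++i) g += (w * x[i] - y[i]) * x[i];
  return g / static_cast<float>(end - begin);
}

int main() {
  std::vector<float> x = {1, 2, 3, 4}, y = {2, 4, 6, 8};
  float w = 0.5f, lr = 0.1f;
  const int iter_size = 2;

  // One update from the full batch of four samples.
  float full = w - lr * Grad(w, x, y, 0, 4);

  // Equivalent: accumulate over two sub-batches, then scale by 1/iter_size
  // (the role the solver's normalization step plays).
  float acc = Grad(w, x, y, 0, 2) + Grad(w, x, y, 2, 4);
  float split = w - lr * acc / iter_size;

  assert(std::fabs(full - split) < 1e-6f);
  std::printf("full=%g split=%g\n", full, split);
  return 0;
}
```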