Simplify training interface by removing weight decay and scaling #695

Merged: 5 commits, Jul 14, 2017
8 changes: 6 additions & 2 deletions dynet/mp.h
@@ -259,7 +259,9 @@ namespace dynet {
}
if (do_update && trainer != nullptr) {
shared_object->update_mutex.wait();
trainer->update(1.0 / counter);
// TODO: The scaling was originally this
// trainer->update(1.0 / counter);
trainer->update();
shared_object->update_mutex.post();
}
if (batch_counter == header.report_frequency) {
@@ -334,7 +336,9 @@ namespace dynet {
batch_loss += datum_loss;
train_loss += datum_loss;
if (++batch_counter == batch_size) {
trainer->update(1.0 / batch_size);
// TODO: The scaling was originally this
// trainer->update(1.0 / batch_size);
trainer->update();
batch_counter = 0;
}
data_processed++;
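The TODO comments above record the per-update scaling (`1.0 / counter`, `1.0 / batch_size`) that the simplified interface drops. Below is a minimal sketch of one way to get a comparable effect through the new interface, using only the public `learning_rate` field and the argument-less `update()` introduced in `dynet/training.h` further down; the helper name is illustrative, not part of this PR. Averaging the loss over the batch before calling `backward()` is another option.

```cpp
#include "dynet/training.h"

// Sketch only: emulate the removed update(scale) by scaling the step size for
// a single update. scale would be e.g. 1.0f / batch_size, as in the old calls.
void scaled_update(dynet::Trainer& trainer, float scale) {
  float saved = trainer.learning_rate;
  trainer.learning_rate *= scale;  // shrink the step for this update only
  trainer.update();                // new argument-less interface
  trainer.learning_rate = saved;   // restore the configured learning rate
}
```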
178 changes: 91 additions & 87 deletions dynet/training.cc

Large diffs are not rendered by default.

108 changes: 49 additions & 59 deletions dynet/training.h
@@ -18,12 +18,12 @@
#include "dynet/shadow-params.h"

#define DYNET_TRAINER_DEFINE_DEV_IMPL() \
void update_params(real scale, real gscale, size_t idx) override; \
void update_lookup_params(real scale, real gscale, size_t idx, size_t lidx) override; \
void update_lookup_params(real scale, real gscale, size_t idx) override; \
void update_params(real gscale, size_t idx) override; \
void update_lookup_params(real gscale, size_t idx, size_t lidx) override; \
void update_lookup_params(real gscale, size_t idx) override; \
template <class MyDevice> \
void update_rule_dev(const MyDevice & dev, real scale, real gscale, const std::vector<Tensor*> & values); \
void update_rule(real scale, real gscale, const std::vector<Tensor*> & values) override;
void update_rule_dev(const MyDevice & dev, real gscale, const std::vector<Tensor*> & values); \
void update_rule(real gscale, const std::vector<Tensor*> & values) override;

namespace dynet {

@@ -39,21 +39,19 @@ struct Trainer {
* \brief General constructor for a Trainer
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param edecay Learning rate decay
*/
explicit Trainer(ParameterCollection& m, real e0, real edecay = 0.0) :
eta0(e0), eta(e0), eta_decay(edecay), epoch(), clipping_enabled(true), clip_threshold(5),
explicit Trainer(ParameterCollection& m, real learning_rate) :
learning_rate(learning_rate), clipping_enabled(true), clip_threshold(5),
clips(), updates(), clips_since_status(), updates_since_status(), sparse_updates_enabled(true), aux_allocated(false), model(&m) {}
virtual ~Trainer();

/**
* \brief Update parameters
* \details Update the parameters according to the appropriate update rule
*
* \param scale The scaling factor for the gradients
*/
virtual void update(real scale = 1.0);
virtual void update();

/**
* \brief Update subset of parameters
@@ -65,36 +63,28 @@
*
* \param updated_params The parameter indices to be updated
* \param updated_lookup_params The lookup parameter indices to be updated
* \param scale The scaling factor for the gradients
*/
void update(const std::vector<unsigned> & updated_params, const std::vector<unsigned> & updated_lookup_params, real scale = 1.0);
void update(const std::vector<unsigned> & updated_params, const std::vector<unsigned> & updated_lookup_params);

void update_epoch(real r = 1) {
epoch += r;
eta = eta0 / (1 + epoch * eta_decay);
}
void update_epoch(real r = 1.0);

/**
* \brief Clip gradient
* \details If clipping is enabled and the gradient is too big, return the amount to
* scale the gradient by (otherwise 1)
*
*
* \param scale The clipping limit
* \return The appropriate scaling factor
*/
float clip_gradients(real scale);
float clip_gradients();

// TODO: This is unprotected temporarily until there is a better solution
// for serializing the weight decay when saving models
// Rescale all the parameters handled by this model
void rescale_and_reset_weight_decay();

// learning rates
real eta0;
real eta;
real eta_decay;
real epoch;
// learning rate
real learning_rate;

// clipping
bool clipping_enabled;
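`clip_gradients()` now takes no arguments, and its return value is the `gscale` factor passed into the update rules further down. Its body lives in `dynet/training.cc`, which is not rendered above, so the following is only a sketch of the usual global-norm rule, assumed rather than copied, built from the `clipping_enabled` and `clip_threshold` fields declared here.

```cpp
#include <cmath>

// Sketch of the usual global-norm clipping rule (assumed, not taken from the
// unrendered training.cc): if the L2 norm of the gradients exceeds
// clip_threshold, scale them down to the threshold; otherwise gscale stays 1.
float gradient_scale(float gradient_l2_norm, bool clipping_enabled, float clip_threshold) {
  float gscale = 1.0f;
  if (clipping_enabled && std::isfinite(gradient_l2_norm) && gradient_l2_norm > clip_threshold)
    gscale = clip_threshold / gradient_l2_norm;
  return gscale;
}
```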
@@ -121,7 +111,7 @@ struct Trainer {
bool aux_allocated;

void status() {
std::cerr << "[epoch=" << epoch << " eta=" << eta << " clips=" << clips_since_status << " updates=" << updates_since_status << "] ";
std::cerr << "[lr=" << learning_rate << " clips=" << clips_since_status << " updates=" << updates_since_status << "] ";
updates_since_status = clips_since_status = 0;
}

@@ -137,15 +127,15 @@
* \param gscale Gradient scale based on clipping
* \param values Values specific to the particular update rule being implemented
*/
virtual void update_rule(real scale, real gscale, const std::vector<Tensor*> & values) = 0;
virtual void update_rule(real gscale, const std::vector<Tensor*> & values) = 0;
/**
* \brief Parameter update function
*
* \param scale Scale of the update (i.e. learning rate)
* \param gscale Gradient scale based on clipping
* \param idx The ID of the parameter to update
*/
virtual void update_params(real scale, real gscale, size_t idx) = 0;
virtual void update_params(real gscale, size_t idx) = 0;
/**
* \brief Sparse lookup parameter update function
*
@@ -154,15 +144,15 @@
* \param idx The ID of the parameter to update
* \param lidx Index of the specific entry within the lookup parameter object
*/
virtual void update_lookup_params(real scale, real gscale, size_t idx, size_t lidx) = 0;
virtual void update_lookup_params(real gscale, size_t idx, size_t lidx) = 0;
/**
* \brief Dense lookup parameter update function
*
* \param scale Scale of the update (i.e. learning rate)
* \param gscale Gradient scale based on clipping
* \param idx The ID of the parameter to update
*/
virtual void update_lookup_params(real scale, real gscale, size_t idx) = 0;
virtual void update_lookup_params(real gscale, size_t idx) = 0;

};
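Taken together, the interface above reduces to: construct a trainer with a learning rate, call `update()` after each backward pass, and manage the schedule by writing to the public `learning_rate` field. A minimal sketch under those assumptions; the graph-building code is elided because this PR does not change it, and the decay factor is an arbitrary example value.

```cpp
#include "dynet/model.h"
#include "dynet/training.h"

void run_training(dynet::ParameterCollection& model, unsigned num_epochs) {
  dynet::SimpleSGDTrainer trainer(model, /*learning_rate=*/0.1);
  for (unsigned epoch = 0; epoch < num_epochs; ++epoch) {
    // ... build the computation graph, forward, backward (unchanged by this PR) ...
    trainer.update();               // no scale argument
    trainer.status();               // now reports "lr=" instead of "epoch=" / "eta="
    trainer.learning_rate *= 0.5f;  // explicit decay replaces eta0 / eta / eta_decay
  }
}
```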

@@ -181,10 +171,10 @@ struct SimpleSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param edecay Learning rate decay parameter.
*/
explicit SimpleSGDTrainer(ParameterCollection& m, real e0 = 0.1, real edecay = 0.0) : Trainer(m, e0, edecay) {}
explicit SimpleSGDTrainer(ParameterCollection& m, real learning_rate = 0.1) : Trainer(m, learning_rate) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
private:
@@ -218,23 +208,23 @@ struct CyclicalSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0_min Lower learning rate
* \param e0_max Upper learning rate
* \param learning_rate_min Lower learning rate
* \param learning_rate_max Upper learning rate
* \param step_size Period of the triangular function in number of iterations (__not__ epochs). According to the original paper, this should be set around (2-8) x (training iterations in epoch)
* \param gamma Learning rate upper bound decay parameter
* \param edecay Learning rate decay parameter. Ideally you shouldn't use this with cyclical learning rate since decay is already handled by \f$\gamma\f$
*/
explicit CyclicalSGDTrainer(ParameterCollection& m, float e0_min = 0.01, float e0_max = 0.1, float step_size = 2000, float gamma = 0.0, float edecay = 0.0) : Trainer(m, e0_min, edecay), e_min(e0_min), e_max(e0_max), step_size(step_size), gamma(gamma), it(0) {}
void update(real scale = 1.0) override {
Trainer::update(scale);
explicit CyclicalSGDTrainer(ParameterCollection& m, float learning_rate_min = 0.01, float learning_rate_max = 0.1, float step_size = 2000, float gamma = 0.0, float edecay = 0.0) : Trainer(m, learning_rate_min), e_min(learning_rate_min), e_max(learning_rate_max), step_size(step_size), gamma(gamma), it(0) {}
void update() override {
Trainer::update();
cyclic_update_eta();
}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
void cyclic_update_eta() {
float cycle = std::floor(1 + ((float) it) / (2 * step_size));
float x = std::abs( ((float) it) / step_size - 2 * cycle + 1);
eta = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real)std::pow(gamma, it) : 0);
learning_rate = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real)std::pow(gamma, it) : 0);
it++;
}
float e_min;
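The schedule in `cyclic_update_eta()` is the triangular cyclical learning rate: it ramps linearly between `e_min` and `e_max` with a period of `2 * step_size` iterations, and the amplitude is shrunk by `gamma^it` when `gamma < 1`. The same formula restated as a pure function, with a worked example:

```cpp
#include <cmath>

// Same computation as cyclic_update_eta(), written as a standalone function.
float cyclical_rate(int it, float e_min, float e_max, float step_size, float gamma) {
  float cycle = std::floor(1 + ((float) it) / (2 * step_size));
  float x = std::abs(((float) it) / step_size - 2 * cycle + 1);
  return e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (float) std::pow(gamma, it) : 0);
}
// With e_min = 0.01, e_max = 0.1, step_size = 2000 and gamma = 1 (no shrinking):
// it = 0 -> 0.01, it = 1000 -> 0.055, it = 2000 -> 0.1, it = 4000 -> 0.01, ...
```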
@@ -262,12 +252,12 @@ struct MomentumSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param mom Momentum
* \param edecay Learning rate decay parameter
*/
explicit MomentumSGDTrainer(ParameterCollection& m, real e0 = 0.01, real mom = 0.9, real edecay = 0.0) :
Trainer(m, e0, edecay), momentum(mom) {}
explicit MomentumSGDTrainer(ParameterCollection& m, real learning_rate = 0.01, real mom = 0.9) :
Trainer(m, learning_rate), momentum(mom) {}

protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
@@ -299,12 +289,12 @@ struct AdagradTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param eps Bias parameter \f$\epsilon\f$ in the adagrad formula
* \param edecay Learning rate decay parameter
*/
explicit AdagradTrainer(ParameterCollection& m, real e0 = 0.1, real eps = 1e-20, real edecay = 0.0) :
Trainer(m, e0, edecay), epsilon(eps) {}
explicit AdagradTrainer(ParameterCollection& m, real learning_rate = 0.1, real eps = 1e-20) :
Trainer(m, learning_rate), epsilon(eps) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -337,8 +327,8 @@ struct AdadeltaTrainer : public Trainer {
* \param rho Update parameter for the moving average of updates in the numerator
* \param edecay Learning rate decay parameter
*/
explicit AdadeltaTrainer(ParameterCollection& m, real eps = 1e-6, real rho = 0.95, real edecay = 0.0) :
Trainer(m, 1.0, edecay), epsilon(eps), rho(rho) {}
explicit AdadeltaTrainer(ParameterCollection& m, real eps = 1e-6, real rho = 0.95) :
Trainer(m, 1.0), epsilon(eps), rho(rho) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -367,13 +357,13 @@ struct RMSPropTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param eps Bias parameter \f$\epsilon\f$ in the adagrad formula
* \param rho Update parameter for the moving average (`rho = 0` is equivalent to using Adagrad)
* \param edecay Learning rate decay parameter
*/
explicit RMSPropTrainer(ParameterCollection& m, real e0 = 0.1, real eps = 1e-20, real rho = 0.95, real edecay = 0.0) :
Trainer(m, e0, edecay), epsilon(eps), rho(rho) {}
explicit RMSPropTrainer(ParameterCollection& m, real learning_rate = 0.1, real eps = 1e-20, real rho = 0.95) :
Trainer(m, learning_rate), epsilon(eps), rho(rho) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -401,14 +391,14 @@ struct AdamTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param beta_1 Moving average parameter for the mean
* \param beta_2 Moving average parameter for the variance
* \param eps Bias parameter \f$\epsilon\f$
* \param edecay Learning rate decay parameter
*/
explicit AdamTrainer(ParameterCollection& m, float e0 = 0.001, float beta_1 = 0.9, float beta_2 = 0.999, float eps = 1e-8, real edecay = 0.0) :
Trainer(m, e0, edecay), beta_1(beta_1), beta_2(beta_2), epsilon(eps) {}
explicit AdamTrainer(ParameterCollection& m, float learning_rate = 0.001, float beta_1 = 0.9, float beta_2 = 0.999, float eps = 1e-8) :
Trainer(m, learning_rate), beta_1(beta_1), beta_2(beta_2), epsilon(eps) {}

protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
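For reference, the constructor signatures after this change: the plain trainers drop the trailing `edecay` argument and keep only a learning rate (where they have one) plus their own hyperparameters (CyclicalSGDTrainer above still accepts an `edecay` parameter, though it no longer forwards it to `Trainer`). A sketch using the default values shown in the hunks above:

```cpp
#include "dynet/model.h"
#include "dynet/training.h"

void build_trainers(dynet::ParameterCollection& m) {
  dynet::SimpleSGDTrainer   sgd(m, 0.1);
  dynet::MomentumSGDTrainer msgd(m, 0.01, 0.9);
  dynet::AdagradTrainer     adagrad(m, 0.1, 1e-20);
  dynet::AdadeltaTrainer    adadelta(m, 1e-6, 0.95);
  dynet::RMSPropTrainer     rmsprop(m, 0.1, 1e-20, 0.95);
  dynet::AdamTrainer        adam(m, 0.001, 0.9, 0.999, 1e-8);
}
```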
@@ -435,26 +425,26 @@ struct AdamTrainer : public Trainer {
*
*/
struct EGTrainer : public Trainer {
explicit EGTrainer(ParameterCollection& mod, real e0 = 0.1, real mom = 0.9, real ne = 0.0, real edecay = 0.0)
: Trainer(mod, e0, edecay), momentum(mom), isCyclical(false) {
explicit EGTrainer(ParameterCollection& mod, real learning_rate = 0.1, real mom = 0.9, real ne = 0.0)
: Trainer(mod, learning_rate), momentum(mom), isCyclical(false) {
zeg.d = meg.d = {1};
zeg.device = meg.device = default_device;
default_device->allocate_tensor(DeviceMempool::PS, zeg);
default_device->allocate_tensor(DeviceMempool::PS, meg);
}

//-----------------------------------------------------------------------------------------
void enableCyclicalLR(float _e0_min = 0.01, float _e0_max = 0.1, float _step_size = 2000, float _gamma = 0.0){
void enableCyclicalLR(float _learning_rate_min = 0.01, float _learning_rate_max = 0.1, float _step_size = 2000, float _gamma = 0.0){
isCyclical = true;
e_min = _e0_min;
e_max = _e0_max;
e_min = _learning_rate_min;
e_max = _learning_rate_max;
step_size = _step_size;
gamma = _gamma;
it = 0;
}

virtual void update(real scale = 1.0) override {
Trainer::update(scale);
virtual void update() override {
Trainer::update();
if (isCyclical) cyclic_update_eta();
}
//-----------------------------------------------------------------------------------------
@@ -473,7 +463,7 @@ struct EGTrainer : public Trainer {
void cyclic_update_eta() {
float cycle = std::floor(1 + ((float) it) / (2 * step_size));
float x = std::abs( ((float) it) / step_size - 2 * cycle + 1);
eta = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real) std::pow(gamma, it) : 0);
learning_rate = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real) std::pow(gamma, it) : 0);
it++;
}

2 changes: 1 addition & 1 deletion examples/cpp/imdb/train_imdb.cc
@@ -282,7 +282,7 @@ int main(int argc, char** argv) {
Expression loss_expr = engine.objective(cg, inst, logits);
loss += as_scalar(cg.forward(loss_expr));
cg.backward(loss_expr);
sgd->update(1.0);
sgd->update();
++lines;
++ttags;
}
5 changes: 2 additions & 3 deletions examples/cpp/mlc/train_mlc.cc
@@ -130,8 +130,7 @@ int main(int argc, char** argv) {

//AdadeltaTrainer sgd(m);
SimpleSGDTrainer sgd(m);
sgd.eta0 = 0.001;
sgd.eta = 0.001;
sgd.learning_rate = 0.001;

unsigned report_every_i = 50;
unsigned si = train.size();
@@ -171,7 +170,7 @@ int main(int argc, char** argv) {
Expression loss_expr = sparsemax_loss(u, &xy.labels);
loss += as_scalar(cg.forward(loss_expr));
cg.backward(loss_expr);
sgd.update(1.0);
sgd.update();
}
cerr << "[epoch=" << (ti / train.size()) << "] E=" << (loss / instances) << ' ';
}
2 changes: 1 addition & 1 deletion examples/cpp/nlm/train_nlm.cc
@@ -75,7 +75,7 @@ int main(int argc, char** argv) {
loss += as_scalar(cg.forward(nerr));
cg.backward(nerr);
++n;
sgd.update(1.0);
sgd.update();
if (n == 2500) break;
}
loss /= n;
2 changes: 1 addition & 1 deletion examples/cpp/read-write/train_read-write.cc
@@ -58,7 +58,7 @@ class XORModel {

float return_loss = as_scalar(cg.forward(loss));
cg.backward(loss);
sgd.update(1.0);
sgd.update();
return return_loss;
}

1 change: 0 additions & 1 deletion examples/cpp/rnnlm-batch-nce/train_rnnlm-batch-nce.cc
@@ -215,6 +215,5 @@ int main(int argc, char** argv) {
cg.backward(loss_exp);
trainer.update();
}
trainer.update_epoch(1.0);
}
}
6 changes: 3 additions & 3 deletions examples/cpp/rnnlm/train_rnnlm.cc
@@ -202,7 +202,7 @@ int main(int argc, char** argv) {
}

Trainer* sgd = new SimpleSGDTrainer(model);
sgd->eta0 = sgd->eta = params.eta0;
sgd->learning_rate = params.eta0;
RNNLanguageModel<LSTMBuilder> lm(model);

bool has_model_to_load = params.model_file != "";
@@ -269,7 +269,7 @@ int main(int argc, char** argv) {
cerr << "**SHUFFLE\n";
completed_epoch++;
if (eta_decay_onset_epoch && completed_epoch >= (int)eta_decay_onset_epoch)
sgd->eta *= eta_decay_rate;
sgd->learning_rate *= eta_decay_rate;
shuffle(order.begin(), order.end(), *rndeng);
}
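The old `Trainer::update_epoch` body applied the schedule `eta = eta0 / (1 + epoch * eta_decay)` (see the removed lines in the `dynet/training.h` hunk above); its replacement body now lives in `dynet/training.cc`, which is not rendered here. This example decays the rate itself by multiplying `learning_rate` once per epoch, as the hunk above shows. A sketch of reproducing the old harmonic schedule explicitly, with illustrative values; the names are local helpers, not Trainer fields:

```cpp
// learning_rate0 is the initial rate, eta_decay the old decay hyperparameter.
float harmonic_decay(float learning_rate0, float eta_decay, float epoch) {
  return learning_rate0 / (1 + epoch * eta_decay);
}
// e.g. sgd->learning_rate = harmonic_decay(0.1f, 0.08f, completed_epoch);
```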

@@ -285,7 +285,7 @@ int main(int argc, char** argv) {
++lines;
}
report++;
cerr << '#' << report << " [epoch=" << (lines / training.size()) << " eta=" << sgd->eta << "] E = " << (loss / chars) << " ppl=" << exp(loss / chars) << ' ';
cerr << '#' << report << " [epoch=" << (lines / training.size()) << " lr=" << sgd->learning_rate << "] E = " << (loss / chars) << " ppl=" << exp(loss / chars) << ' ';

// show score on dev data?
if (report % dev_every_i_reports == 0) {