Simplify training interface by removing weight decay and scaling #695

Merged: 5 commits, Jul 14, 2017
8 changes: 6 additions & 2 deletions dynet/mp.h
@@ -259,7 +259,9 @@ namespace dynet {
}
if (do_update && trainer != nullptr) {
shared_object->update_mutex.wait();
trainer->update(1.0 / counter);
// TODO: The scaling was originally this
// trainer->update(1.0 / counter);
trainer->update();
shared_object->update_mutex.post();
}
if (batch_counter == header.report_frequency) {
@@ -334,7 +336,9 @@ namespace dynet {
batch_loss += datum_loss;
train_loss += datum_loss;
if (++batch_counter == batch_size) {
trainer->update(1.0 / batch_size);
// TODO: The scaling was originally this
// trainer->update(1.0 / batch_size);
trainer->update();
batch_counter = 0;
}
data_processed++;
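The TODO comments above record the per-update scaling (`1.0 / counter`, `1.0 / batch_size`) that the simplified interface drops. Below is a minimal sketch of one way to get a comparable effect through the new interface, using only the public `learning_rate` field and the argument-less `update()` introduced in `dynet/training.h` further down; the helper name is illustrative, not part of this PR. Averaging the loss over the batch before calling `backward()` is another option.

```cpp
#include "dynet/training.h"

// Sketch only: emulate the removed update(scale) by scaling the step size for
// a single update. scale would be e.g. 1.0f / batch_size, as in the old calls.
void scaled_update(dynet::Trainer& trainer, float scale) {
  float saved = trainer.learning_rate;
  trainer.learning_rate *= scale;  // shrink the step for this update only
  trainer.update();                // new argument-less interface
  trainer.learning_rate = saved;   // restore the configured learning rate
}
```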
178 changes: 91 additions & 87 deletions dynet/training.cc

Large diffs are not rendered by default.

108 changes: 49 additions & 59 deletions dynet/training.h
@@ -18,12 +18,12 @@
#include "dynet/shadow-params.h"

#define DYNET_TRAINER_DEFINE_DEV_IMPL() \
void update_params(real scale, real gscale, size_t idx) override; \
void update_lookup_params(real scale, real gscale, size_t idx, size_t lidx) override; \
void update_lookup_params(real scale, real gscale, size_t idx) override; \
void update_params(real gscale, size_t idx) override; \
void update_lookup_params(real gscale, size_t idx, size_t lidx) override; \
void update_lookup_params(real gscale, size_t idx) override; \
template <class MyDevice> \
void update_rule_dev(const MyDevice & dev, real scale, real gscale, const std::vector<Tensor*> & values); \
void update_rule(real scale, real gscale, const std::vector<Tensor*> & values) override;
void update_rule_dev(const MyDevice & dev, real gscale, const std::vector<Tensor*> & values); \
void update_rule(real gscale, const std::vector<Tensor*> & values) override;

namespace dynet {

@@ -39,21 +39,19 @@ struct Trainer {
* \brief General constructor for a Trainer
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param edecay Learning rate decay
*/
explicit Trainer(ParameterCollection& m, real e0, real edecay = 0.0) :
eta0(e0), eta(e0), eta_decay(edecay), epoch(), clipping_enabled(true), clip_threshold(5),
explicit Trainer(ParameterCollection& m, real learning_rate) :
learning_rate(learning_rate), clipping_enabled(true), clip_threshold(5),
clips(), updates(), clips_since_status(), updates_since_status(), sparse_updates_enabled(true), aux_allocated(false), model(&m) {}
virtual ~Trainer();

/**
* \brief Update parameters
* \details Update the parameters according to the appropriate update rule
*
* \param scale The scaling factor for the gradients
*/
virtual void update(real scale = 1.0);
virtual void update();

/**
* \brief Update subset of parameters
@@ -65,36 +63,28 @@
*
* \param updated_params The parameter indices to be updated
* \param updated_lookup_params The lookup parameter indices to be updated
* \param scale The scaling factor for the gradients
*/
void update(const std::vector<unsigned> & updated_params, const std::vector<unsigned> & updated_lookup_params, real scale = 1.0);
void update(const std::vector<unsigned> & updated_params, const std::vector<unsigned> & updated_lookup_params);

void update_epoch(real r = 1) {
epoch += r;
eta = eta0 / (1 + epoch * eta_decay);
}
void update_epoch(real r = 1.0);

/**
* \brief Clip gradient
* \details If clipping is enabled and the gradient is too big, return the amount to
* scale the gradient by (otherwise 1)
*
*
* \param scale The clipping limit
* \return The appropriate scaling factor
*/
float clip_gradients(real scale);
float clip_gradients();

// TODO: This is unprotected temporarily until there is a better solution
// for serializing the weight decay when saving models
// Rescale all the parameters handled by this model
void rescale_and_reset_weight_decay();

// learning rates
real eta0;
real eta;
real eta_decay;
real epoch;
// learning rate
real learning_rate;

// clipping
bool clipping_enabled;
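`clip_gradients()` now takes no arguments, and its return value is the `gscale` factor passed into the update rules further down. Its body lives in `dynet/training.cc`, which is not rendered above, so the following is only a sketch of the usual global-norm rule, assumed rather than copied, built from the `clipping_enabled` and `clip_threshold` fields declared here.

```cpp
#include <cmath>

// Sketch of the usual global-norm clipping rule (assumed, not taken from the
// unrendered training.cc): if the L2 norm of the gradients exceeds
// clip_threshold, scale them down to the threshold; otherwise gscale stays 1.
float gradient_scale(float gradient_l2_norm, bool clipping_enabled, float clip_threshold) {
  float gscale = 1.0f;
  if (clipping_enabled && std::isfinite(gradient_l2_norm) && gradient_l2_norm > clip_threshold)
    gscale = clip_threshold / gradient_l2_norm;
  return gscale;
}
```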
@@ -121,7 +111,7 @@ struct Trainer {
bool aux_allocated;

void status() {
std::cerr << "[epoch=" << epoch << " eta=" << eta << " clips=" << clips_since_status << " updates=" << updates_since_status << "] ";
std::cerr << "[lr=" << learning_rate << " clips=" << clips_since_status << " updates=" << updates_since_status << "] ";
updates_since_status = clips_since_status = 0;
}

@@ -137,15 +127,15 @@
* \param gscale Gradient scale based on clipping
* \param values Values specific to the particular update rule being implemented
*/
virtual void update_rule(real scale, real gscale, const std::vector<Tensor*> & values) = 0;
virtual void update_rule(real gscale, const std::vector<Tensor*> & values) = 0;
/**
* \brief Parameter update function
*
* \param scale Scale of the update (i.e. learning rate)
* \param gscale Gradient scale based on clipping
* \param idx The ID of the parameter to update
*/
virtual void update_params(real scale, real gscale, size_t idx) = 0;
virtual void update_params(real gscale, size_t idx) = 0;
/**
* \brief Sparse lookup parameter update function
*
@@ -154,15 +144,15 @@
* \param idx The ID of the parameter to update
* \param lidx Index of the specific entry within the lookup parameter object
*/
virtual void update_lookup_params(real scale, real gscale, size_t idx, size_t lidx) = 0;
virtual void update_lookup_params(real gscale, size_t idx, size_t lidx) = 0;
/**
* \brief Dense lookup parameter update function
*
* \param scale Scale of the update (i.e. learning rate)
* \param gscale Gradient scale based on clipping
* \param idx The ID of the parameter to update
*/
virtual void update_lookup_params(real scale, real gscale, size_t idx) = 0;
virtual void update_lookup_params(real gscale, size_t idx) = 0;

};
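Taken together, the interface above reduces to: construct a trainer with a learning rate, call `update()` after each backward pass, and manage the schedule by writing to the public `learning_rate` field. A minimal sketch under those assumptions; the graph-building code is elided because this PR does not change it, and the decay factor is an arbitrary example value.

```cpp
#include "dynet/model.h"
#include "dynet/training.h"

void run_training(dynet::ParameterCollection& model, unsigned num_epochs) {
  dynet::SimpleSGDTrainer trainer(model, /*learning_rate=*/0.1);
  for (unsigned epoch = 0; epoch < num_epochs; ++epoch) {
    // ... build the computation graph, forward, backward (unchanged by this PR) ...
    trainer.update();               // no scale argument
    trainer.status();               // now reports "lr=" instead of "epoch=" / "eta="
    trainer.learning_rate *= 0.5f;  // explicit decay replaces eta0 / eta / eta_decay
  }
}
```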

@@ -181,10 +171,10 @@ struct SimpleSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param edecay Learning rate decay parameter.
*/
explicit SimpleSGDTrainer(ParameterCollection& m, real e0 = 0.1, real edecay = 0.0) : Trainer(m, e0, edecay) {}
explicit SimpleSGDTrainer(ParameterCollection& m, real learning_rate = 0.1) : Trainer(m, learning_rate) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
private:
@@ -218,23 +208,23 @@ struct CyclicalSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0_min Lower learning rate
* \param e0_max Upper learning rate
* \param learning_rate_min Lower learning rate
* \param learning_rate_max Upper learning rate
* \param step_size Period of the triangular function in number of iterations (__not__ epochs). According to the original paper, this should be set around (2-8) x (training iterations in epoch)
* \param gamma Learning rate upper bound decay parameter
* \param edecay Learning rate decay parameter. Ideally you shouldn't use this with cyclical learning rate since decay is already handled by \f$\gamma\f$
*/
explicit CyclicalSGDTrainer(ParameterCollection& m, float e0_min = 0.01, float e0_max = 0.1, float step_size = 2000, float gamma = 0.0, float edecay = 0.0) : Trainer(m, e0_min, edecay), e_min(e0_min), e_max(e0_max), step_size(step_size), gamma(gamma), it(0) {}
void update(real scale = 1.0) override {
Trainer::update(scale);
explicit CyclicalSGDTrainer(ParameterCollection& m, float learning_rate_min = 0.01, float learning_rate_max = 0.1, float step_size = 2000, float gamma = 0.0, float edecay = 0.0) : Trainer(m, learning_rate_min), e_min(learning_rate_min), e_max(learning_rate_max), step_size(step_size), gamma(gamma), it(0) {}
void update() override {
Trainer::update();
cyclic_update_eta();
}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
void cyclic_update_eta() {
float cycle = std::floor(1 + ((float) it) / (2 * step_size));
float x = std::abs( ((float) it) / step_size - 2 * cycle + 1);
eta = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real)std::pow(gamma, it) : 0);
learning_rate = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real)std::pow(gamma, it) : 0);
it++;
}
float e_min;
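The schedule in `cyclic_update_eta()` is the triangular cyclical learning rate: it ramps linearly between `e_min` and `e_max` with a period of `2 * step_size` iterations, and the amplitude is shrunk by `gamma^it` when `gamma < 1`. The same formula restated as a pure function, with a worked example:

```cpp
#include <cmath>

// Same computation as cyclic_update_eta(), written as a standalone function.
float cyclical_rate(int it, float e_min, float e_max, float step_size, float gamma) {
  float cycle = std::floor(1 + ((float) it) / (2 * step_size));
  float x = std::abs(((float) it) / step_size - 2 * cycle + 1);
  return e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (float) std::pow(gamma, it) : 0);
}
// With e_min = 0.01, e_max = 0.1, step_size = 2000 and gamma = 1 (no shrinking):
// it = 0 -> 0.01, it = 1000 -> 0.055, it = 2000 -> 0.1, it = 4000 -> 0.01, ...
```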
@@ -262,12 +252,12 @@ struct MomentumSGDTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param mom Momentum
* \param edecay Learning rate decay parameter
*/
explicit MomentumSGDTrainer(ParameterCollection& m, real e0 = 0.01, real mom = 0.9, real edecay = 0.0) :
Trainer(m, e0, edecay), momentum(mom) {}
explicit MomentumSGDTrainer(ParameterCollection& m, real learning_rate = 0.01, real mom = 0.9) :
Trainer(m, learning_rate), momentum(mom) {}

protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
@@ -299,12 +289,12 @@ struct AdagradTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param eps Bias parameter \f$\epsilon\f$ in the adagrad formula
* \param edecay Learning rate decay parameter
*/
explicit AdagradTrainer(ParameterCollection& m, real e0 = 0.1, real eps = 1e-20, real edecay = 0.0) :
Trainer(m, e0, edecay), epsilon(eps) {}
explicit AdagradTrainer(ParameterCollection& m, real learning_rate = 0.1, real eps = 1e-20) :
Trainer(m, learning_rate), epsilon(eps) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -337,8 +327,8 @@ struct AdadeltaTrainer : public Trainer {
* \param rho Update parameter for the moving average of updates in the numerator
* \param edecay Learning rate decay parameter
*/
explicit AdadeltaTrainer(ParameterCollection& m, real eps = 1e-6, real rho = 0.95, real edecay = 0.0) :
Trainer(m, 1.0, edecay), epsilon(eps), rho(rho) {}
explicit AdadeltaTrainer(ParameterCollection& m, real eps = 1e-6, real rho = 0.95) :
Trainer(m, 1.0), epsilon(eps), rho(rho) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -367,13 +357,13 @@ struct RMSPropTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param eps Bias parameter \f$\epsilon\f$ in the adagrad formula
* \param rho Update parameter for the moving average (`rho = 0` is equivalent to using Adagrad)
* \param edecay Learning rate decay parameter
*/
explicit RMSPropTrainer(ParameterCollection& m, real e0 = 0.1, real eps = 1e-20, real rho = 0.95, real edecay = 0.0) :
Trainer(m, e0, edecay), epsilon(eps), rho(rho) {}
explicit RMSPropTrainer(ParameterCollection& m, real learning_rate = 0.1, real eps = 1e-20, real rho = 0.95) :
Trainer(m, learning_rate), epsilon(eps), rho(rho) {}
protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
virtual void alloc_impl() override;
@@ -401,14 +391,14 @@ struct AdamTrainer : public Trainer {
* \brief Constructor
*
* \param m ParameterCollection to be trained
* \param e0 Initial learning rate
* \param learning_rate Initial learning rate
* \param beta_1 Moving average parameter for the mean
* \param beta_2 Moving average parameter for the variance
* \param eps Bias parameter \f$\epsilon\f$
* \param edecay Learning rate decay parameter
*/
explicit AdamTrainer(ParameterCollection& m, float e0 = 0.001, float beta_1 = 0.9, float beta_2 = 0.999, float eps = 1e-8, real edecay = 0.0) :
Trainer(m, e0, edecay), beta_1(beta_1), beta_2(beta_2), epsilon(eps) {}
explicit AdamTrainer(ParameterCollection& m, float learning_rate = 0.001, float beta_1 = 0.9, float beta_2 = 0.999, float eps = 1e-8) :
Trainer(m, learning_rate), beta_1(beta_1), beta_2(beta_2), epsilon(eps) {}

protected:
DYNET_TRAINER_DEFINE_DEV_IMPL()
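For reference, the constructor signatures after this change: the plain trainers drop the trailing `edecay` argument and keep only a learning rate (where they have one) plus their own hyperparameters (CyclicalSGDTrainer above still accepts an `edecay` parameter, though it no longer forwards it to `Trainer`). A sketch using the default values shown in the hunks above:

```cpp
#include "dynet/model.h"
#include "dynet/training.h"

void build_trainers(dynet::ParameterCollection& m) {
  dynet::SimpleSGDTrainer   sgd(m, 0.1);
  dynet::MomentumSGDTrainer msgd(m, 0.01, 0.9);
  dynet::AdagradTrainer     adagrad(m, 0.1, 1e-20);
  dynet::AdadeltaTrainer    adadelta(m, 1e-6, 0.95);
  dynet::RMSPropTrainer     rmsprop(m, 0.1, 1e-20, 0.95);
  dynet::AdamTrainer        adam(m, 0.001, 0.9, 0.999, 1e-8);
}
```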
@@ -435,26 +425,26 @@ struct AdamTrainer : public Trainer {
*
*/
struct EGTrainer : public Trainer {
explicit EGTrainer(ParameterCollection& mod, real e0 = 0.1, real mom = 0.9, real ne = 0.0, real edecay = 0.0)
: Trainer(mod, e0, edecay), momentum(mom), isCyclical(false) {
explicit EGTrainer(ParameterCollection& mod, real learning_rate = 0.1, real mom = 0.9, real ne = 0.0)
: Trainer(mod, learning_rate), momentum(mom), isCyclical(false) {
zeg.d = meg.d = {1};
zeg.device = meg.device = default_device;
default_device->allocate_tensor(DeviceMempool::PS, zeg);
default_device->allocate_tensor(DeviceMempool::PS, meg);
}

//-----------------------------------------------------------------------------------------
void enableCyclicalLR(float _e0_min = 0.01, float _e0_max = 0.1, float _step_size = 2000, float _gamma = 0.0){
void enableCyclicalLR(float _learning_rate_min = 0.01, float _learning_rate_max = 0.1, float _step_size = 2000, float _gamma = 0.0){
isCyclical = true;
e_min = _e0_min;
e_max = _e0_max;
e_min = _learning_rate_min;
e_max = _learning_rate_max;
step_size = _step_size;
gamma = _gamma;
it = 0;
}

virtual void update(real scale = 1.0) override {
Trainer::update(scale);
virtual void update() override {
Trainer::update();
if (isCyclical) cyclic_update_eta();
}
//-----------------------------------------------------------------------------------------
@@ -473,7 +463,7 @@ struct EGTrainer : public Trainer {
void cyclic_update_eta() {
float cycle = std::floor(1 + ((float) it) / (2 * step_size));
float x = std::abs( ((float) it) / step_size - 2 * cycle + 1);
eta = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real) std::pow(gamma, it) : 0);
learning_rate = e_min + ((1 - x) > 0 ? (e_max - e_min) * (1 - x) * (real) std::pow(gamma, it) : 0);
it++;
}

2 changes: 1 addition & 1 deletion examples/cpp/imdb/train_imdb.cc
@@ -282,7 +282,7 @@ int main(int argc, char** argv) {
Expression loss_expr = engine.objective(cg, inst, logits);
loss += as_scalar(cg.forward(loss_expr));
cg.backward(loss_expr);
sgd->update(1.0);
sgd->update();
++lines;
++ttags;
}
5 changes: 2 additions & 3 deletions examples/cpp/mlc/train_mlc.cc
@@ -130,8 +130,7 @@ int main(int argc, char** argv) {

//AdadeltaTrainer sgd(m);
SimpleSGDTrainer sgd(m);
sgd.eta0 = 0.001;
sgd.eta = 0.001;
sgd.learning_rate = 0.001;

unsigned report_every_i = 50;
unsigned si = train.size();
@@ -171,7 +170,7 @@ int main(int argc, char** argv) {
Expression loss_expr = sparsemax_loss(u, &xy.labels);
loss += as_scalar(cg.forward(loss_expr));
cg.backward(loss_expr);
sgd.update(1.0);
sgd.update();
}
cerr << "[epoch=" << (ti / train.size()) << "] E=" << (loss / instances) << ' ';
}
2 changes: 1 addition & 1 deletion examples/cpp/nlm/train_nlm.cc
@@ -75,7 +75,7 @@ int main(int argc, char** argv) {
loss += as_scalar(cg.forward(nerr));
cg.backward(nerr);
++n;
sgd.update(1.0);
sgd.update();
if (n == 2500) break;
}
loss /= n;
2 changes: 1 addition & 1 deletion examples/cpp/read-write/train_read-write.cc
@@ -58,7 +58,7 @@ class XORModel {

float return_loss = as_scalar(cg.forward(loss));
cg.backward(loss);
sgd.update(1.0);
sgd.update();
return return_loss;
}

1 change: 0 additions & 1 deletion examples/cpp/rnnlm-batch-nce/train_rnnlm-batch-nce.cc
@@ -215,6 +215,5 @@ int main(int argc, char** argv) {
cg.backward(loss_exp);
trainer.update();
}
trainer.update_epoch(1.0);
}
}
6 changes: 3 additions & 3 deletions examples/cpp/rnnlm/train_rnnlm.cc
@@ -202,7 +202,7 @@ int main(int argc, char** argv) {
}

Trainer* sgd = new SimpleSGDTrainer(model);
sgd->eta0 = sgd->eta = params.eta0;
sgd->learning_rate = params.eta0;
RNNLanguageModel<LSTMBuilder> lm(model);

bool has_model_to_load = params.model_file != "";
@@ -269,7 +269,7 @@ int main(int argc, char** argv) {
cerr << "**SHUFFLE\n";
completed_epoch++;
if (eta_decay_onset_epoch && completed_epoch >= (int)eta_decay_onset_epoch)
sgd->eta *= eta_decay_rate;
sgd->learning_rate *= eta_decay_rate;
shuffle(order.begin(), order.end(), *rndeng);
}
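The old `Trainer::update_epoch` body applied the schedule `eta = eta0 / (1 + epoch * eta_decay)` (see the removed lines in the `dynet/training.h` hunk above); its replacement body now lives in `dynet/training.cc`, which is not rendered here. This example decays the rate itself by multiplying `learning_rate` once per epoch, as the hunk above shows. A sketch of reproducing the old harmonic schedule explicitly, with illustrative values; the names are local helpers, not Trainer fields:

```cpp
// learning_rate0 is the initial rate, eta_decay the old decay hyperparameter.
float harmonic_decay(float learning_rate0, float eta_decay, float epoch) {
  return learning_rate0 / (1 + epoch * eta_decay);
}
// e.g. sgd->learning_rate = harmonic_decay(0.1f, 0.08f, completed_epoch);
```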

@@ -285,7 +285,7 @@ int main(int argc, char** argv) {
++lines;
}
report++;
cerr << '#' << report << " [epoch=" << (lines / training.size()) << " eta=" << sgd->eta << "] E = " << (loss / chars) << " ppl=" << exp(loss / chars) << ' ';
cerr << '#' << report << " [epoch=" << (lines / training.size()) << " lr=" << sgd->learning_rate << "] E = " << (loss / chars) << " ppl=" << exp(loss / chars) << ' ';

// show score on dev data?
if (report % dev_every_i_reports == 0) {