Optimizer library #2190
```cmake
@@ -1,4 +1,5 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_subdirectory(optimizer)

go_library(adder SRCS adder.go)
```
```diff
@@ -26,7 +26,8 @@ const (
 type Parameter struct {
 	Name        string
 	ElementType ElementType
-	Content     []byte
+	Size        uint32
+	// Content []byte
 }

 // ParameterWithConfig contains the parameter and the configuration.
```
```diff
@@ -42,15 +43,16 @@ type Gradient Parameter
 type Service struct {
 	initialized chan struct{}

-	mu       sync.Mutex
-	opt      *optimizer
-	paramMap map[string]Parameter
+	mu       sync.Mutex
+	paramMap map[string]Parameter
```
Review comment: Since there is already `optimizerMap`, and the optimizer owns the parameter, maybe we no longer need `paramMap`?

Reply: Agreed, will fix the Go part. Done.
```diff
+	optimizerMap map[string]*optimizer // per parameter to optmizer
 }

 // NewService creates a new service.
 func NewService() *Service {
 	s := &Service{}
 	s.paramMap = make(map[string]Parameter)
+	s.optimizerMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
 	return s
 }
```
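A minimal sketch of what the comment above suggests, with `paramMap` removed so a parameter is reachable only through the optimizer that owns it. The stub `optimizer` type and the exact field layout are assumptions for illustration, not the code merged in this PR:

```go
package pserver

import "sync"

// optimizer stands in for the Cgo wrapper defined elsewhere in this PR;
// its fields are omitted in this sketch.
type optimizer struct{}

// Service keyed purely by optimizerMap: each optimizer owns its Parameter,
// so a separate paramMap is no longer needed.
type Service struct {
	initialized chan struct{}

	mu           sync.Mutex
	optimizerMap map[string]*optimizer // one optimizer per parameter name
}

// NewService creates a new service.
func NewService() *Service {
	s := &Service{}
	s.optimizerMap = make(map[string]*optimizer)
	s.initialized = make(chan struct{})
	return s
}
```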
```diff
@@ -71,8 +73,9 @@ func (s *Service) BeginInitParams(config []byte, dummy *int) error {
 		s.opt.Cleanup()
 	}

-	// TODO(helin): parse learning rate from config
-	s.opt = newOptimizer(sgd, 0.01)
+	// TODO(h
+	// elin): parse learning rate from config
+	s.opt = newOptimizer(config OptimizerConfig)
```
Review comment: I have not seen the definition of `OptimizerConfig`.

Reply: Fixed it; changed to string.
```diff
 	return nil
 }
```
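Given the note above, one way to avoid needing an `OptimizerConfig` type on the Go side is to hand the still-serialized configuration straight to the optimizer and let the C++ side parse it. This is only a sketch under that assumption; the `[]byte` signature and field names are hypothetical:

```go
package pserver

// optimizer wraps the C++ optimizer library; only the configuration
// handling is shown in this sketch.
type optimizer struct {
	config []byte // serialized OptimizerConfig proto, parsed by the C++ side
}

// newOptimizer keeps the configuration opaque to Go, so no generated
// OptimizerConfig type is required here. Hypothetical signature.
func newOptimizer(config []byte) *optimizer {
	return &optimizer{config: config}
}
```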
```diff
@@ -135,7 +138,10 @@ func (s *Service) SendGrads(grads []Gradient, dummy *int) error {
 	errCh := make(chan error, count)
 	for _, g := range grads {
 		go func(p Parameter, g Gradient) {
```
Review comment: This change introduces a concurrent read of `optimizerMap`. One option is:

```go
go func(name string, g Gradient) {
	s.mu.Lock()
	defer s.mu.Unlock()
	opt, err := s.optimizerMap[p.Name]
	if err != nil {
		err = opt.UpdateParameter(p, g)
	}
}
```

The above function locks the mutex until optimization is finished, which is safe, since a concurrent update to an optimizer is a race condition. But performance will suffer, since there is only a single mutex per Service. An alternative:

```go
go func(o *Optimizer, g Gradient) {
	err := o.UpdateParameter(g)
	errCh <- err
}(s.optimizerMap[g.Name], g) // we are still protected by the mutex when invoking s.optimizerMap[g.Name]
```

Reply: Thanks for the kind reminder! I agree. I had thought we did not need to protect against concurrent updates.
```diff
-			err := s.opt.UpdateParameter(p, g)
+			opt, err := s.optimizerMap[p.Name]
```
Review comment: I noticed the code here does not compile, because the second value returned by a map lookup is a `bool`, not an `error`. We should not check in code that does not compile. Maybe I can do the Go code part for now, and you can get more familiar with Go by reviewing the PR?

Reply: Sorry, I had not written the CMake script the right way; I will check in code more carefully. Thanks for your Go lint editor plugin and your commit to the Go CMake script, now I can run this part. Thanks!
```diff
+			if err != nil {
+				err := opt.UpdateParameter(p, g)
```
Review comment: Do not use `:=` here (it shadows the outer `err`).

Reply: Sadly I do my work on Baidu's dev machine, and for the last two days "jumbo" (a package management system) has been broken across all of Baidu, so this did not go through the Go compile step.
```diff
+			}
 			errCh <- err
 		}(s.paramMap[g.Name], g)
 	}
```
```cmake
@@ -0,0 +1,29 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})

set(OPITMIZER_SRCS
	adadelta_optimizer.cc
	adagrad_optimizer.cc
	adam_optimizer.cc
	optimizer.cc
	parameter_optimizer.cc
	sgd_optmizer.cc
	regularizer.cc
)

set(OPITMIZER_Headers
	adadelta_optimizer.h
	adagrad_optimizer.h
	adam_optimizer.h
	lr_policy.h
	optimizer.h
	parameter_optimizer.h
	regularizer.h
	sgd_optimizer.h
	Tensor.h
)

add_library(optimizer STATIC ${OPITMIZER_SRCS})
add_dependencies(optimizer gen_proto_cpp)

add_simple_unittest(optimizer_test)
add_simple_unittest(optimizer_factory_test)
```
```cpp
@@ -0,0 +1,30 @@
#ifndef PADDLE_OPTIMIZER_TENSOR_H_
#define PADDLE_OPTIMIZER_TENSOR_H_
/**
 * @brief tensor used by optimizer
 */

#include "paddle/math/BaseMatrix.h"
#include <string.h>

namespace paddle {
namespace optimizer {

template <class T>
using TensorBase = BaseMatrixT<T>;

template <class T>
class Tensor : public TensorBase<T> {
public:
  Tensor(T* data, int size) : TensorBase<T>(size, 1, 0, data, false, false) {}
  T* get_buffer() { return this->data_; }
  // TODO: replace with tensorshape
  size_t width() {
    return this->width_;
  }
};

}  // optimizer
}  // paddle

#endif
```
```cpp
@@ -0,0 +1,44 @@
#include "adadelta_optimizer.h"
#include <algorithm>

namespace paddle {
namespace optimizer {
template<class T>
AdadeltaOptimizer<T>::AdadeltaOptimizer(const ::paddle::OptimizerConfig &config) : ParameterOptimizer<T>(config) {
  rho = config.adadelta().rho();
  epsilon = config.adadelta().epsilon();
  decay = config.adadelta().decay();
}

template<class T>
void AdadeltaOptimizer<T>::set_weight(const Tensor<T> *p) {
  size_t size = p->width();
  T* gptr = new T[size];
  accum_gradient = Tensor<T>(gptr, size);
  T* dptr = new T[size];
  accum_delta = Tensor<T>(dtpr, size);
  T* dptr_current = new T[size];
  update_delta = Tensor<T>(dptr_current, size);
}

template<class T>
void AdadeltaOptimizer<T>::update(const Tensor<T> &gradient) {
  num_sample_passed += 1;
  double learning_rate = lr_policy->get_learning_rate();
  for(size_t i=0; i<parameter_.size(); ++i) {
    accum_gradient[i] = rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i];

    update_delta[i] = std::sqrt(accum_delta[i] + epsilon) / std::sqrt(accum_gradient[i] + epsilon) * gradient[i];

    accum_delta[i] = rho * accum_delta[i] + (1.0-rho) * update_delta[i] * update_delta[i];

    parameter_[i] -= update_delta[i] + decay*parameter_[i];
  }
}

template class AdadeltaOptimizer<float>;
template class AdadeltaOptimizer<double>;

}
}
```
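For reference, the per-element loop above corresponds to the usual Adadelta update plus the extra weight-decay term carried by `decay`; writing `accum_gradient` as $E[g^2]$, `accum_delta` as $E[\Delta\theta^2]$, `update_delta` as $\Delta\theta$, and `decay` as $\lambda$:

```latex
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta\theta_t &= \frac{\sqrt{E[\Delta\theta^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
E[\Delta\theta^2]_t &= \rho\, E[\Delta\theta^2]_{t-1} + (1-\rho)\, \Delta\theta_t^2 \\
\theta_{t+1} &= \theta_t - \left(\Delta\theta_t + \lambda\,\theta_t\right)
\end{aligned}
```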
```cpp
@@ -0,0 +1,35 @@
#ifndef PADDLE_ADADELTA_OPTIMIZER_H_
#define PADDLE_ADADELTA_OPTIMIZER_H_

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

template <class T>
class AdadeltaOptimizer : public ParameterOptimizer<T> {
public:
  AdadeltaOptimizer(const OptimizerConfig &config);
  ~AdadeltaOptimizer() {
    if (accum_gradient) delete accum_gradient;
    if (accum_delta) delete accum_delta;
    if (update_delta) delete update_delta;
  }
  void update(const Tensor<T> &gradient);
  void set_weight(const Tensor<T> *p);
  T* get_weight() const;

private:
  Tensor<T> *accum_gradient;
  Tensor<T> *accum_delta;
  Tensor<T> *update_delta;

  double rho;
  double epsilon;
  double decay;
};

}
}

#endif
```
```cpp
@@ -0,0 +1,36 @@
#include "adagrad_optimizer.h"

namespace paddle {
namespace optimizer {
template<class T>
AdagradOptimizer<T>::AdagradOptimizer(const ::paddle::OptimizerConfig &config) : ParameterOptimizer<T>(config) {
  epsilon = config.adagrad().epsilon();
  decay = config.adagrad().decay();
}

template<class T>
void AdagradOptimizer<T>::set_weight(const Tensor<T> *p) {
  size_t size = p->width();
  T* gptr = new T[size];
  accum_gradient = Tensor<T>(gptr, size);
  T* dptr = new T[size];
  accum_delta = Tensor<T>(dtpr, size);
  T* dptr_current = new T[size];
  update_delta = Tensor<T>(dptr_current, size);
}

template<class T>
void AdagradOptimizer<T>::update(const Tensor<T> &gradient) {
  num_sample_passed += 1;
  double learning_rate = lr_policy->get_learning_rate();
  for(size_t i=0; i<parameter_.size(); ++i) {
    accum_gradient[i] += gradient[i] * gradient[i];
    parameter_[i] += learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) + decay * parameter_[i]);
  }
}

template class AdagradOptimizer<float>;
template class AdagradOptimizer<double>;
}
}
```
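For reference, the textbook Adagrad rule that `accum_gradient` accumulates is shown below; this implementation also folds a `decay * parameter_[i]` term into the step, and applies the step with `+=`, so the sign convention of `gradient` determines whether it is a descent step:

```latex
\begin{aligned}
G_t &= G_{t-1} + g_t^2 \\
\theta_{t+1} &= \theta_t - \frac{\eta}{\sqrt{G_t + \epsilon}}\, g_t
\end{aligned}
```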
```cpp
@@ -0,0 +1,30 @@
#ifndef PADDLE_ADAGRAD_OPTIMIZER_H_
#define PADDLE_ADAGRAD_OPTIMIZER_H_

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

template <class T>
class AdagradOptimizer : public ParameterOptimizer<T> {
public:
  AdagradOptimizer(const OptimizerConfig &config);
  ~AdagradOptimizer() {
    if (accum_gradient) delete accum_gradient;
  }
  void update(const Tensor<T> &gradient);
  void set_weight(const Tensor<T> *p);
  T* get_weight() const;

private:
  Tensor<T> *accum_gradient;
  double epsilon;
  double decay;
};

}
}

#endif
```
```cpp
@@ -0,0 +1,37 @@
#include "adam_optimizer.h"

namespace paddle {
namespace optimizer {
template<class T>
AdamOptimizer<T>::AdamOptimizer(const ::paddle::OptimizerConfig &config) : ParameterOptimizer<T>(config) {
  beta_1 = config.adam().beta_1();
  beta_2 = config.adam().beta_2();
  epsilon = config.adam().epsilon();
  decay = config.adam().decay();
}

template<class T>
void AdamOptimizer<T>::set_weight(const Tensor<T> *p) {
  size_t size = p->width();
  T* mptr = new T[size];
  momentums_ = Tensor<T>(mptr, size);
  T* vptr = new T[size];
  velocitys_ = Tensor<T>(vtpr, size);
}

template<class T>
void AdamOptimizer<T>::update(const Tensor<T> &gradient) {
  num_sample_passed += 1;
  double learning_rate = lr_policy->get_learning_rate();
  for(size_t i=0; i<parameter_.size(); ++i) {
    accum_gradient[i] += gradient[i] * gradient[i];
    parameter_[i] += learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) + decay * parameter_[i]);
  }
}

template class AdamOptimizer<float>;
template class AdamOptimizer<double>;
}
}
```
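For reference, the standard Adam update that the `momentums_` and `velocitys_` members are presumably intended to track is shown below; the loop above still mirrors the Adagrad step, so this is a reference formulation rather than a description of the committed code:

```latex
\begin{aligned}
m_t &= \beta_1\, m_{t-1} + (1-\beta_1)\, g_t \\
v_t &= \beta_2\, v_{t-1} + (1-\beta_2)\, g_t^2 \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t} \\
\theta_{t+1} &= \theta_t - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon}\, \hat{m}_t
\end{aligned}
```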
```cpp
@@ -0,0 +1,30 @@
#ifndef PADDLE_ADAM_OPTIMIZER_H_
#define PADDLE_ADAM_OPTIMIZER_H_

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

template <class T>
class AdamOptimizer : public ParameterOptimizer<T> {
public:
  AdamOptimizer(const OptimizerConfig &config);
  ~AdamOptimizer() {}
  void update(const Tensor<T> &gradient);
  void set_weight(const Tensor<T> *p);
  T* get_weight() const;

private:
  Tensor<T> *momentums_;
  Tensor<T> *velocitys_;
  double beta_1;
  double beta_2;
  double epsilon;
  double decay;
};

}  // namespace optimizer
}  // namespace paddle
#endif
```
```cpp
@@ -0,0 +1,31 @@
#ifndef PADDLE_OPTIMIZER_LR_POLICY_H_
#define PADDLE_OPTIMIZER_LR_POLICY_H_

#include "OptimizerConfig.ph.h"

namespace paddle {
namespace optimizer {

class BaseLr {
public:
  LrPolicyBase(const OpitmizerConfig &config) {
    learning_rate = config.lr_config().learning_rate();
  }
  virtual double get_learning_rate(const uint64_t num_sample_passed) = 0;
private:
  double learning_rate;
};

// constant learning rate policy
class ConstLr final : public BaseLr {
public:
  double get_learning_rate(const uint64_t num_sample_passed) {
    return learning_rate;
  }
};

}
}

#endif
```
Review comment: No commented-out code, please.

Reply: Fix done.