Revert "[GPUPS]support for Std adagrad (PaddlePaddle#64)"
This reverts commit 0e31882.
zmxdream committed Oct 26, 2022
1 parent 0e31882 commit 506e0c3
Showing 8 changed files with 7 additions and 198 deletions.
12 changes: 0 additions & 12 deletions paddle/fluid/framework/data_feed.h
@@ -923,7 +923,6 @@ class DataFeed {
virtual const std::vector<std::string>& GetInsIdVec() const {
return ins_id_vec_;
}
virtual void SetInsIdVec(MiniBatchGpuPack* pack) {}
virtual const std::vector<std::string>& GetInsContentVec() const {
return ins_content_vec_;
}
@@ -1492,17 +1491,6 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed<SlotRecord> {
// CustomParser* parser) {}
virtual void PutToFeedVec(const std::vector<SlotRecord>& ins_vec) {}

virtual void SetInsIdVec(MiniBatchGpuPack* pack) {
if (parse_ins_id_) {
size_t ins_num = pack->ins_num();
ins_id_vec_.clear();
ins_id_vec_.resize(ins_num);
for(size_t i = 0; i < ins_num; i++) {
ins_id_vec_[i] = pack->get_lineid(i);
}
}
}

virtual void LoadIntoMemoryByCommand(void);
virtual void LoadIntoMemoryByLib(void);
virtual void LoadIntoMemoryByLine(void);
2 changes: 1 addition & 1 deletion paddle/fluid/framework/device_worker.cc
@@ -168,7 +168,7 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
continue;
}
ars[i] += ins_id_vec[i];
if (ins_content_vec.size() > i) ars[i] = ars[i] + "\t" + ins_content_vec[i];
ars[i] = ars[i] + "\t" + ins_content_vec[i];
}
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
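Read together with the data_feed.h hunk above: on the GPU-pack path, SetInsIdVec fills ins_id_vec_ with one entry per instance while ins_content_vec_ can remain empty, so the guard being removed here kept DumpField from indexing past the end of ins_content_vec. Below is a minimal sketch of the guarded loop, for illustration only — the names come from the hunks, and the surrounding dump loop is assumed rather than shown in this commit:

    // Illustration only: why the reverted bounds check mattered.
    const auto& ins_id_vec = device_reader_->GetInsIdVec();
    const auto& ins_content_vec = device_reader_->GetInsContentVec();
    for (size_t i = 0; i < ins_id_vec.size(); ++i) {
      ars[i] += ins_id_vec[i];
      if (ins_content_vec.size() > i) {  // guard removed by this revert
        ars[i] = ars[i] + "\t" + ins_content_vec[i];
      }
    }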
18 changes: 5 additions & 13 deletions paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -167,13 +167,11 @@ class CommonFeatureValueAccessor {

// total length (in floats) computed from mf_dim
__host__ __device__ int Dim(int mf_dim) {
int tmp_embedx_sgd_dim = 1; // shared adagrad
int tmp_embedx_sgd_dim = 1;
if (mf_optimizer_type_ == 3) {//adam
tmp_embedx_sgd_dim = mf_dim * 2 + 2;
} else if (mf_optimizer_type_ == 4) { //shared_adam
tmp_embedx_sgd_dim = 4;
} else if (mf_optimizer_type_ == 2) { // std adagrad
tmp_embedx_sgd_dim = mf_dim;
}
return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
}
@@ -185,13 +183,11 @@ class CommonFeatureValueAccessor {

// mf_size in bytes, computed from mf_dim
__host__ __device__ int MFSize(int mf_dim) {
int tmp_embedx_sgd_dim = 1; // shared adagrad
int tmp_embedx_sgd_dim = 1;
if (mf_optimizer_type_ == 3) { //adam
tmp_embedx_sgd_dim = mf_dim * 2 + 2;
} else if (mf_optimizer_type_ == 4) { //shared_adam
tmp_embedx_sgd_dim = 4;
} else if (mf_optimizer_type_ == 2) { // std adagrad
tmp_embedx_sgd_dim = mf_dim;
}
return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float);
}
@@ -200,14 +196,12 @@

__host__ __device__ int EmbedxWOffsetIndex(float* val) {
// has mf
int tmp_embedx_sgd_dim = 1; // shared adagrad
int tmp_embedx_sgd_dim = 1;
if ((int)MfSize(val) > 0) {
if (mf_optimizer_type_ == 3) {//adam
tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2;
} else if (mf_optimizer_type_ == 4) { //shared_adam
tmp_embedx_sgd_dim = 4;
} else if (mf_optimizer_type_ == 2) { // std adagrad
tmp_embedx_sgd_dim = int(MfDim(val));
}
return EmbedxG2SumIndex() + tmp_embedx_sgd_dim;
} else {
@@ -303,6 +297,7 @@ class CommonFeatureValueAccessor {
}
};


__host__ __device__ CommonFeatureValueAccessor() {}
__host__ __device__ ~CommonFeatureValueAccessor() {}

@@ -345,10 +340,7 @@ class CommonFeatureValueAccessor {
} else if (optimizer_type == 4) { //shared_adam
common_feature_value.embed_sgd_dim = 4;
common_feature_value.embedx_sgd_dim = 4;
} else if (optimizer_type == 2) { // std adagrad
common_feature_value.embed_sgd_dim = 1;
common_feature_value.embedx_sgd_dim = sparse_embedx_dim;
} else if (optimizer_type == 1) { // shared adagrad
} else {
common_feature_value.embed_sgd_dim = 1;
common_feature_value.embedx_sgd_dim = 1;
}
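For orientation, the branches above decide how many floats each key occupies in device memory. The following minimal host-side sketch re-derives Dim(mf_dim) for the optimizer types named in this hunk; it is illustration only — the constant 9, the embed_sgd_dim parameter, and the sample values mirror the diff rather than any exported API:

    #include <cstdio>

    // Re-derivation of CommonFeatureValueAccessor::Dim() from the hunk above:
    // floats per key = 9 fixed fields + embed optimizer state
    //                  + embedx optimizer state + mf_dim embedding weights.
    int Dim(int mf_optimizer_type, int embed_sgd_dim, int mf_dim) {
      int embedx_sgd_dim = 1;                 // shared adagrad (default)
      if (mf_optimizer_type == 3) {           // adam
        embedx_sgd_dim = mf_dim * 2 + 2;
      } else if (mf_optimizer_type == 4) {    // shared_adam
        embedx_sgd_dim = 4;
      } else if (mf_optimizer_type == 2) {    // std adagrad (removed by this revert)
        embedx_sgd_dim = mf_dim;
      }
      return 9 + embed_sgd_dim + embedx_sgd_dim + mf_dim;
    }

    int main() {
      // With mf_dim = 8 and embed_sgd_dim = 1:
      std::printf("shared adagrad: %d floats per key\n", Dim(1, 1, 8));  // 19
      std::printf("std adagrad:    %d floats per key\n", Dim(2, 1, 8));  // 26
      return 0;
    }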
8 changes: 1 addition & 7 deletions paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
@@ -25,18 +25,12 @@ HeterPsBase* HeterPsBase::get_instance(
// NOTE(zhangminxu): GPUPS sparse table optimizer type.
// Currently embed & embedx must use the same sparse optimizer;
// support for different optimizers for embed & embedx will be added later.
if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 1) { // optimizer_type == 1 means adagrad
auto* accessor_wrapper_ptr =
GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 1) { // optimizer_type == 1 means adagrad
// auto* accessor_wrapper_ptr =
// GlobalAccessorFactory::GetInstance().GetAccessorWrapper();
CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)->AccessorPtr();
return new HeterPs<CommonFeatureValueAccessor, SparseAdagradOptimizer>(capacity, resource, *gpu_accessor);
} else if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 2) { // std_adagrad
CommonFeatureValueAccessor* gpu_accessor =
((AccessorWrapper<CommonFeatureValueAccessor>*)accessor_wrapper_ptr)->AccessorPtr();
return new HeterPs<CommonFeatureValueAccessor, StdAdagradOptimizer>(capacity, resource, *gpu_accessor);
} else {
CHECK(0) << " HeterPsBase get_instance Warning: now only support "
"DownpourCtrDymfAccessor && SparseAdagradOptimizer, but get accessor_type:"
153 changes: 0 additions & 153 deletions paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
@@ -162,159 +162,6 @@ class SparseAdagradOptimizer {
GPUAccessor gpu_accessor_;
};

template <typename GPUAccessor>
class StdAdagradOptimizer {

public:
StdAdagradOptimizer() {}
StdAdagradOptimizer(GPUAccessor& gpu_accessor) {
gpu_accessor_ = gpu_accessor;
}

~StdAdagradOptimizer() {}

__device__ void update_lr(const OptimizerConfig& optimizer_config, float& w, float& g2sum, float g, float scale) {
// double add_g2sum = 0;
double ratio = optimizer_config.learning_rate *
sqrt(optimizer_config.initial_g2sum /
(optimizer_config.initial_g2sum + g2sum));
double scaled_grad = g / scale;

w += scaled_grad * ratio;

if (w < optimizer_config.min_bound) w = optimizer_config.min_bound;
if (w > optimizer_config.max_bound) w = optimizer_config.max_bound;

g2sum += scaled_grad * scaled_grad;

// g2sum += add_g2sum;
}

__device__ int g2sum_index() {
return 0;
}

__device__ void update_mf(const OptimizerConfig& optimizer_config, int n, float* w, float* sgd, const float* g,
float scale) {
// double add_g2sum = 0;
// double ratio = optimizer_config.mf_learning_rate *
// sqrt(optimizer_config.mf_initial_g2sum /
// (optimizer_config.mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) {
float& g2sum = sgd[g2sum_index() + i];
double scaled_grad = g[i] / scale;

double ratio = optimizer_config.mf_learning_rate *
sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum));

w[i] += scaled_grad * ratio;

if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound;

g2sum += scaled_grad * scaled_grad;
}

// g2sum += add_g2sum / n;
}

/*
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, ValType* ptr, const GradType* grad) {
ptr->slot = grad.slot;
ptr->show += grad.show;
ptr->clk += grad.clk;
ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
optimizer_config.clk_coeff * grad.clk;
update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show);
// ptr->mf_dim = grad.mf_dim;
if (ptr->mf_size == 0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) +
optimizer_config.clk_coeff * ptr->clk) {
ptr->mf_size = ptr->mf_dim + 1;
ptr->mf[0] = 0;
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state;
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < ptr->mf_dim; ++i) {
ptr->mf[i + 1] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
}
} else {
update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g,
grad.show); // for local test
}
}
*/

__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, float* ptr, const float* grad, curandState& state) {
float grad_show = grad[gpu_accessor_.common_push_value.ShowIndex()];
float grad_clk = grad[gpu_accessor_.common_push_value.ClickIndex()];

ptr[gpu_accessor_.common_feature_value.SlotIndex()] =
grad[gpu_accessor_.common_push_value.SlotIndex()];

ptr[gpu_accessor_.common_feature_value.ShowIndex()] += grad_show;
ptr[gpu_accessor_.common_feature_value.ClickIndex()] += grad_clk;

ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (grad_show - grad_clk) +
optimizer_config.clk_coeff * grad_clk;

float ptr_show = ptr[gpu_accessor_.common_feature_value.ShowIndex()];
float ptr_clk = ptr[gpu_accessor_.common_feature_value.ClickIndex()];
float grad_lr_g = grad[gpu_accessor_.common_push_value.EmbedGIndex()];

float ptr_mf_size = ptr[gpu_accessor_.common_feature_value.MfSizeIndex()];
int ptr_mf_dim = (int)(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]);

update_lr(
optimizer_config,
ptr[gpu_accessor_.common_feature_value.EmbedWIndex()],
ptr[gpu_accessor_.common_feature_value.EmbedG2SumIndex()],
grad_lr_g,
grad_show);

if (ptr_mf_size == (float)0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (ptr_show - ptr_clk) +
optimizer_config.clk_coeff * ptr_clk) {

ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] =
gpu_accessor_.common_feature_value.MFSize(ptr_mf_dim) / sizeof(float);

// get embedxw index
int embedx_w_index = gpu_accessor_.common_feature_value.EmbedxWOffsetIndex(ptr);

for (int i = 0; i < ptr_mf_dim; ++i) {
ptr[embedx_w_index + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() + i] = 0;
}
}
} else {
int embedx_w_index = gpu_accessor_.common_feature_value.EmbedxWOffsetIndex(ptr);
update_mf(
optimizer_config,
ptr_mf_dim,
&ptr[embedx_w_index],
&ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex()],
&grad[gpu_accessor_.common_push_value.EmbedxGIndex()],
grad_show);
}
}

private:
GPUAccessor gpu_accessor_;
};



} // end namespace framework
} // end namespace paddle
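Judging by the Dim()/MFSize() branches in feature_value.h, the deleted StdAdagradOptimizer keeps one g2sum accumulator per embedx dimension, whereas the surviving SparseAdagradOptimizer shares a single accumulator across the block. Below is a minimal CPU-side sketch of the per-dimension step performed by the deleted update_mf; the Config struct is a simplified stand-in for OptimizerConfig:

    #include <cmath>

    // Simplified stand-in for the OptimizerConfig fields used below.
    struct Config {
      float mf_learning_rate;
      float mf_initial_g2sum;
      float mf_min_bound;
      float mf_max_bound;
    };

    // One per-dimension Adagrad step over the n embedx weights, mirroring the
    // deleted StdAdagradOptimizer::update_mf: dimension i owns g2sum[i].
    void StdAdagradStep(const Config& cfg, int n, float* w, float* g2sum,
                        const float* g, float scale) {
      for (int i = 0; i < n; ++i) {
        double grad = g[i] / scale;  // gradient scaled by the show count
        double ratio = cfg.mf_learning_rate *
                       std::sqrt(cfg.mf_initial_g2sum /
                                 (cfg.mf_initial_g2sum + g2sum[i]));
        w[i] += grad * ratio;
        if (w[i] < cfg.mf_min_bound) w[i] = cfg.mf_min_bound;
        if (w[i] > cfg.mf_max_bound) w[i] = cfg.mf_max_bound;
        g2sum[i] += grad * grad;
      }
    }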
1 change: 0 additions & 1 deletion paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -1496,7 +1496,6 @@ void PSGPUWrapper::InitializeGPUServer(const std::string& fleet_desc) {
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
add_sparse_optimizer(
config, sparse_table_accessor.embedx_sgd_param(), "mf_");
config["mf_embedx_dim"] = sparse_table_accessor.embedx_dim(); // default = 8
}
config["sparse_shard_num"] = sparse_table.shard_num();

8 changes: 0 additions & 8 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -241,14 +241,6 @@ class PSGPUWrapper {
: config["sparse_shard_num"];
thread_keys_shard_num_ = sparse_shard_num;
VLOG(0) << "GPUPS set sparse shard num: " << thread_keys_shard_num_;

// set mf optimizer type
int mf_optimizer_type = (config.find("mf_optimizer_type") == config.end())
? 1
: config["mf_optimizer_type"];
optimizer_type_ = mf_optimizer_type;

VLOG(0) << "GPUPS set mf optimizer type:" << optimizer_type_;

hbm_thread_pool_.resize(thread_keys_shard_num_);
for (size_t i = 0; i < hbm_thread_pool_.size(); i++) {
3 changes: 0 additions & 3 deletions paddle/fluid/framework/ps_gpu_worker.cc
@@ -438,9 +438,6 @@ void PSGPUWorker::TrainFiles() {
std::chrono::microseconds(200));
}
thread_scope = cur_task.scope;
auto pack = cur_task.pack;
device_reader_->SetInsIdVec(pack);

// tensor share buffer
std::vector<Variable*>& cur_scope_vars = need_reuse_var_vec_[thread_scope];
PADDLE_ENFORCE_EQ(cur_scope_vars.size(), need_reuse_var_.size(),
