From 506e0c360e57b4f05917f0030b0902e8d3fbcf1b Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 26 Oct 2022 20:46:58 +0800 Subject: [PATCH] Revert "[GPUPS]support for Std adagrad (#64)" This reverts commit 0e31882b0f023f20349cc43b315edbe911c6e09b. --- paddle/fluid/framework/data_feed.h | 12 -- paddle/fluid/framework/device_worker.cc | 2 +- .../framework/fleet/heter_ps/feature_value.h | 18 +-- .../framework/fleet/heter_ps/heter_ps.cu | 8 +- .../framework/fleet/heter_ps/optimizer.cuh.h | 153 ------------------ .../fluid/framework/fleet/ps_gpu_wrapper.cc | 1 - paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 8 - paddle/fluid/framework/ps_gpu_worker.cc | 3 - 8 files changed, 7 insertions(+), 198 deletions(-) diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index b04946ab46b50..fbb223217d1ec 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -923,7 +923,6 @@ class DataFeed { virtual const std::vector& GetInsIdVec() const { return ins_id_vec_; } - virtual void SetInsIdVec(MiniBatchGpuPack* pack) {} virtual const std::vector& GetInsContentVec() const { return ins_content_vec_; } @@ -1492,17 +1491,6 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { // CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) {} - virtual void SetInsIdVec(MiniBatchGpuPack* pack) { - if (parse_ins_id_) { - size_t ins_num = pack->ins_num(); - ins_id_vec_.clear(); - ins_id_vec_.resize(ins_num); - for(size_t i = 0; i < ins_num; i++) { - ins_id_vec_[i] = pack->get_lineid(i); - } - } - } - virtual void LoadIntoMemoryByCommand(void); virtual void LoadIntoMemoryByLib(void); virtual void LoadIntoMemoryByLine(void); diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 011c21ff3d825..880261436831d 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -168,7 +168,7 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode, continue; } ars[i] += ins_id_vec[i]; - if (ins_content_vec.size() > i) ars[i] = ars[i] + "\t" + ins_content_vec[i]; + ars[i] = ars[i] + "\t" + ins_content_vec[i]; } for (auto& field : *dump_fields_) { Variable* var = scope.FindVar(field); diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 2240d6054118b..e8611c7a73aa7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -167,13 +167,11 @@ class CommonFeatureValueAccessor { // 根据mf_dim计算的总长度 __host__ __device__ int Dim(int mf_dim) { - int tmp_embedx_sgd_dim = 1; // shared adagrad + int tmp_embedx_sgd_dim = 1; if (mf_optimizer_type_ == 3) {//adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; } else if (mf_optimizer_type_ == 4) { //shared_adam tmp_embedx_sgd_dim = 4; - } else if (mf_optimizer_type_ == 2) { // std adagrad - tmp_embedx_sgd_dim = mf_dim; } return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; } @@ -185,13 +183,11 @@ class CommonFeatureValueAccessor { // 根据mf_dim 计算的 mf_size byte数 __host__ __device__ int MFSize(int mf_dim) { - int tmp_embedx_sgd_dim = 1; // shared adagrad + int tmp_embedx_sgd_dim = 1; if (mf_optimizer_type_ == 3) { //adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; } else if (mf_optimizer_type_ == 4) { //shared_adam tmp_embedx_sgd_dim = 4; - } else if (mf_optimizer_type_ = 2) { // std adagrad - tmp_embedx_sgd_dim = mf_dim; } return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); } @@ -200,14 +196,12 @@ class CommonFeatureValueAccessor { __host__ __device__ int EmbedxWOffsetIndex(float* val) { // has mf - int tmp_embedx_sgd_dim = 1; // shared adagrad + int tmp_embedx_sgd_dim = 1; if ((int)MfSize(val) > 0) { if (mf_optimizer_type_ == 3) {//adam tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; } else if (mf_optimizer_type_ == 4) { //shared_adam tmp_embedx_sgd_dim = 4; - } else if (mf_optimizer_type_ == 2) { // std adagrad - tmp_embedx_sgd_dim = int(MfDim(val)); } return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; } else { @@ -303,6 +297,7 @@ class CommonFeatureValueAccessor { } }; + __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} @@ -345,10 +340,7 @@ class CommonFeatureValueAccessor { } else if (optimizer_type == 4) { //shared_adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = 4; - } else if (optimizer_type == 2) { // std adagrad - common_feature_value.embed_sgd_dim = 1; - common_feature_value.embedx_sgd_dim = sparse_embedx_dim; - } else if (optimizer_type == 1) { // shared adagrad + } else { common_feature_value.embed_sgd_dim = 1; common_feature_value.embedx_sgd_dim = 1; } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 0517be6c5ac40..0acb7086fabb9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -25,18 +25,12 @@ HeterPsBase* HeterPsBase::get_instance( // NOTE(zhangminxu): gpups' sparse table optimizer type, // now only support embed&embedx 's sparse optimizer is the same // we will support using diff optimizer for embed&embedx + if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 1) { // optimizer_type == 1 means adagrad auto* accessor_wrapper_ptr = GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); - if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 1) { // optimizer_type == 1 means adagrad - // auto* accessor_wrapper_ptr = - // GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); CommonFeatureValueAccessor* gpu_accessor = ((AccessorWrapper*)accessor_wrapper_ptr)->AccessorPtr(); return new HeterPs(capacity, resource, *gpu_accessor); - } else if (accessor_type == "DownpourCtrDymfAccessor" && optimizer_type == 2) { // std_adagrad - CommonFeatureValueAccessor* gpu_accessor = - ((AccessorWrapper*)accessor_wrapper_ptr)->AccessorPtr(); - return new HeterPs(capacity, resource, *gpu_accessor); } else { CHECK(0) << " HeterPsBase get_instance Warning: now only support " "DownpourCtrDymfAccessor && SparseAdagradOptimizer, but get accessor_type:" diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index bdb5dd621810a..f1712d1ce7799 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -162,159 +162,6 @@ class SparseAdagradOptimizer { GPUAccessor gpu_accessor_; }; -template -class StdAdagradOptimizer { - -public: - StdAdagradOptimizer() {} - StdAdagradOptimizer(GPUAccessor& gpu_accessor) { - gpu_accessor_ = gpu_accessor; - } - - ~StdAdagradOptimizer() {} - - __device__ void update_lr(const OptimizerConfig& optimizer_config, float& w, float& g2sum, float g, float scale) { - // double add_g2sum = 0; - double ratio = optimizer_config.learning_rate * - sqrt(optimizer_config.initial_g2sum / - (optimizer_config.initial_g2sum + g2sum)); - double scaled_grad = g / scale; - - w += scaled_grad * ratio; - - if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; - if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; - - g2sum += scaled_grad * scaled_grad; - - // g2sum += add_g2sum; - } - - __device__ int g2sum_index() { - return 0; - } - - __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, float* w, float* sgd, const float* g, - float scale) { - // double add_g2sum = 0; - // double ratio = optimizer_config.mf_learning_rate * - // sqrt(optimizer_config.mf_initial_g2sum / - // (optimizer_config.mf_initial_g2sum + g2sum)); - for (int i = 0; i < n; ++i) { - float& g2sum = sgd[g2sum_index() + i]; - double scaled_grad = g[i] / scale; - - double ratio = optimizer_config.mf_learning_rate * - sqrt(optimizer_config.mf_initial_g2sum / - (optimizer_config.mf_initial_g2sum + g2sum)); - - w[i] += scaled_grad * ratio; - - if (w[i] < optimizer_config.mf_min_bound) - w[i] = optimizer_config.mf_min_bound; - if (w[i] > optimizer_config.mf_max_bound) - w[i] = optimizer_config.mf_max_bound; - - g2sum += scaled_grad * scaled_grad; - } - - // g2sum += add_g2sum / n; - } - - /* - __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, ValType* ptr, const GradType* grad) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; - - update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - // ptr->mf_dim = grad.mf_dim; - - if (ptr->mf_size == 0) { - if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config.clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; - ptr->mf[0] = 0; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - curandState state; - curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config.mf_initial_range; - } - } - } else { - update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, - grad.show); // for local test - } - } - */ - - __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, float* ptr, const float* grad, curandState& state) { - float grad_show = grad[gpu_accessor_.common_push_value.ShowIndex()]; - float grad_clk = grad[gpu_accessor_.common_push_value.ClickIndex()]; - - ptr[gpu_accessor_.common_feature_value.SlotIndex()] = - grad[gpu_accessor_.common_push_value.SlotIndex()]; - - ptr[gpu_accessor_.common_feature_value.ShowIndex()] += grad_show; - ptr[gpu_accessor_.common_feature_value.ClickIndex()] += grad_clk; - - ptr[gpu_accessor_.common_feature_value.DeltaScoreIndex()] += - optimizer_config.nonclk_coeff * (grad_show - grad_clk) + - optimizer_config.clk_coeff * grad_clk; - - float ptr_show = ptr[gpu_accessor_.common_feature_value.ShowIndex()]; - float ptr_clk = ptr[gpu_accessor_.common_feature_value.ClickIndex()]; - float grad_lr_g = grad[gpu_accessor_.common_push_value.EmbedGIndex()]; - - float ptr_mf_size = ptr[gpu_accessor_.common_feature_value.MfSizeIndex()]; - int ptr_mf_dim = (int)(ptr[gpu_accessor_.common_feature_value.MfDimIndex()]); - - update_lr( - optimizer_config, - ptr[gpu_accessor_.common_feature_value.EmbedWIndex()], - ptr[gpu_accessor_.common_feature_value.EmbedG2SumIndex()], - grad_lr_g, - grad_show); - - if (ptr_mf_size == (float)0) { - if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (ptr_show - ptr_clk) + - optimizer_config.clk_coeff * ptr_clk) { - - ptr[gpu_accessor_.common_feature_value.MfSizeIndex()] = - gpu_accessor_.common_feature_value.MFSize(ptr_mf_dim) / sizeof(float); - - // get embedxw index - int embedx_w_index = gpu_accessor_.common_feature_value.EmbedxWOffsetIndex(ptr); - - for (int i = 0; i < ptr_mf_dim; ++i) { - ptr[embedx_w_index + i] = - (curand_uniform(&state)) * optimizer_config.mf_initial_range; - ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex() + i] = 0; - } - } - } else { - int embedx_w_index = gpu_accessor_.common_feature_value.EmbedxWOffsetIndex(ptr); - update_mf( - optimizer_config, - ptr_mf_dim, - &ptr[embedx_w_index], - &ptr[gpu_accessor_.common_feature_value.EmbedxG2SumIndex()], - &grad[gpu_accessor_.common_push_value.EmbedxGIndex()], - grad_show); - } - } - -private: - GPUAccessor gpu_accessor_; -}; - - } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index a0c436d1ff6a6..66a99da060b72 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -1496,7 +1496,6 @@ void PSGPUWrapper::InitializeGPUServer(const std::string& fleet_desc) { add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); add_sparse_optimizer( config, sparse_table_accessor.embedx_sgd_param(), "mf_"); - config["mf_embedx_dim"] = sparse_table_accessor.embedx_dim(); // default = 8 } config["sparse_shard_num"] = sparse_table.shard_num(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index edfce1171ae6d..dc4d80b7718db 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -241,14 +241,6 @@ class PSGPUWrapper { : config["sparse_shard_num"]; thread_keys_shard_num_ = sparse_shard_num; VLOG(0) << "GPUPS set sparse shard num: " << thread_keys_shard_num_; - - // set mf optimizer type - int mf_optimizer_type = (config.find("mf_optimizer_type") == config.end()) - ? 1 - : config["mf_optimizer_type"]; - optimizer_type_ = mf_optimizer_type; - - VLOG(0) << "GPUPS set mf optimizer type:" << optimizer_type_; hbm_thread_pool_.resize(thread_keys_shard_num_); for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 27f458f475a14..b4b3ca5f2fc8f 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -438,9 +438,6 @@ void PSGPUWorker::TrainFiles() { std::chrono::microseconds(200)); } thread_scope = cur_task.scope; - auto pack = cur_task.pack; - device_reader_->SetInsIdVec(pack); - // tensor share buffer std::vector& cur_scope_vars = need_reuse_var_vec_[thread_scope]; PADDLE_ENFORCE_EQ(cur_scope_vars.size(), need_reuse_var_.size(),