Skip to content

Commit

Permalink
Merge pull request #17 from qingshui/paddlebox
Browse files: browse the repository at this point in the history
fix ins bug, add mean logloss gpu op
  • Loading branch information
qingshui authored Oct 29, 2021
2 parents edf364c + 1188ee6 commit 638d3f1
Show file tree
Hide file tree
Showing 17 changed files with 238 additions and 151 deletions.
2 changes: 1 addition & 1 deletion cmake/external/box_ps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
#SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps.tar.gz" CACHE STRING "" FORCE)
SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.17" CACHE STRING "" FORCE)
SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.20" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/boxps_trainer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ void BoxPSTrainer::InitDumpEnv() {
dump_thread_.push_back(
std::thread(std::bind(&TrainerBase::DumpWork, this, i)));
}
VLOG(0) << "init dump write file thread num=" << dump_thread_num_;
}

void BoxPSTrainer::CopyParameters(const Scope& root_scope, int device_id) {
Expand Down
8 changes: 6 additions & 2 deletions paddle/fluid/framework/boxps_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,9 @@ void BoxPSWorker::TrainFiles() {
int step = 0;
platform::SetDeviceId(device_id_);
while ((batch_size = PackBatchTask()) > 0) {
VLOG(3) << "begin running ops, batch size:" << batch_size;
VLOG(2) << "[" << device_id_
<< "]begin running ops, batch size:" << batch_size
<< ", batch id=" << step;
if (dense_table_) {
dense_table_->PullDense(place_, *thread_scope_);
}
Expand Down Expand Up @@ -560,7 +562,9 @@ void BoxPSWorker::TrainFilesWithProfiler() {
if (batch_size <= 0) {
break;
}
VLOG(3) << "begin running ops, read batch size: " << batch_size;
VLOG(2) << "[" << device_id_
<< "]begin running ops, batch size:" << batch_size
<< ", batch id=" << step_cnt;

cal_timer.Resume();
int op_id = 0;
Expand Down
5 changes: 3 additions & 2 deletions paddle/fluid/framework/data_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1768,7 +1768,8 @@ void PadBoxSlotDataset::ShuffleData(int thread_num) {
timer.Resume();
for (auto& t : data) {
int client_id = 0;
if (enable_pv_merge_) { // shuffle by pv
if (enable_pv_merge_ ||
FLAGS_enable_shuffle_by_searchid) { // shuffle by pv
client_id = t->search_id % mpi_size_;
} else if (merge_by_insid_) { // shuffle by lineid
client_id =
Expand Down Expand Up @@ -2041,7 +2042,7 @@ static void compute_batch_num(const int64_t ins_num, const int batch_size,
int cur_pos = 0;
int offset_num = static_cast<int>(ins_num / thread_batch_num) * thread_num;
int left_ins_num = static_cast<int>(ins_num % thread_batch_num);
if (left_ins_num > 0 && left_ins_num < thread_num) {
if (left_ins_num > 0 && left_ins_num < (thread_num * 2) && offset_num > 1) {
offset_num = offset_num - thread_num;
left_ins_num = left_ins_num + thread_batch_num;
for (int i = 0; i < offset_num; ++i) {
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/data_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
DECLARE_int32(padbox_dataset_shuffle_thread_num);
DECLARE_int32(padbox_dataset_merge_thread_num);
DECLARE_int32(padbox_max_shuffle_wait_count);
DECLARE_bool(enable_shuffle_by_searchid);
namespace boxps {
class PSAgentBase;
}
Expand Down
17 changes: 8 additions & 9 deletions paddle/fluid/framework/device_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,14 +154,14 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
}
hit[i] = true;
if (FLAGS_lineid_have_extend_info) {
size_t pos = lineid.find(" ");
if (pos != std::string::npos) {
ars[i] += lineid.substr(0, pos);
} else {
ars[i] += lineid;
}
} else {
size_t pos = lineid.find(" ");
if (pos != std::string::npos) {
ars[i] += lineid.substr(0, pos);
} else {
ars[i] += lineid;
}
} else {
ars[i] += lineid;
}
}
for (auto& field : *dump_fields_) {
Expand Down Expand Up @@ -218,10 +218,9 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
const std::string& lineid = device_reader_->GetLineId(i);
size_t pos = lineid.find(" ");
if (pos != std::string::npos) {
ars[i] = ars[i] + "\t" + lineid.substr(pos + 1);
ars[i] = ars[i] + "\t" + lineid.substr(pos + 1);
}
}

writer_ << ars[i];
}
}
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/framework/fleet/box_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ void BoxWrapper::BeginFeedPass(int date, boxps::PSAgentBase** agent) {
VLOG(3) << "gpu cache dim:" << dim;
gpu_replica_cache.emplace_back(dim);
}
if (dataset_name_ == "InputTableDataset") {
if (input_table_dim_ > 0) {
VLOG(3) << "lookup input dim: " << input_table_dim_;
input_table_deque_.emplace_back(input_table_dim_);
}
Expand All @@ -587,7 +587,7 @@ void BoxWrapper::EndFeedPass(boxps::PSAgentBase* agent) {
t.ToHBM();
VLOG(0) << "gpu cache memory: " << t.GpuMemUsed() << "MB";
}
if (dataset_name_ == "InputTableDataset") {
if (input_table_dim_ > 0) {
auto& t = input_table_deque_.back();
VLOG(0) << "input table size: " << t.size() << " miss: " << t.miss()
<< ", cpu memory: " << t.CpuMemUsed() << "MB";
Expand All @@ -614,7 +614,7 @@ void BoxWrapper::EndPass(bool need_save_delta) {
if (FLAGS_use_gpu_replica_cache) {
gpu_replica_cache.pop_front();
}
if (dataset_name_ == "InputTableDataset") {
if (input_table_dim_ > 0) {
input_table_deque_.pop_front();
}
int ret = boxps_ptr_->EndPass(need_save_delta);
Expand Down
Loading

0 comments on commit 638d3f1

Please sign in to comment.