diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index 7d8eb27cc7e760..b2b9dbe0b27ff5 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -33,6 +33,9 @@ int32_t PsLocalClient::Initialize() { ::std::future PsLocalClient::Shrink(uint32_t table_id, const std::string threshold) { + // threshold not use + auto* table_ptr = GetTable(table_id); + table_ptr->Shrink(""); return done(); } diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index ed2e54f5ff9c7c..509bc9eb5d111f 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -177,6 +177,96 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( return res; } +paddle::framework::GpuPsCommGraphFloatFea GraphTable::make_gpu_ps_graph_float_fea( + int gpu_id, std::vector &node_ids, int float_slot_num) { + size_t shard_num = 64; + std::vector> bags(shard_num); + std::vector feature_array[shard_num]; + std::vector slot_id_array[shard_num]; + std::vector node_id_array[shard_num]; + std::vector node_fea_info_array[shard_num]; + for (size_t i = 0; i < shard_num; i++) { + auto predsize = node_ids.size() / shard_num; + bags[i].reserve(predsize * 1.2); + feature_array[i].reserve(predsize * 1.2 * float_slot_num); + slot_id_array[i].reserve(predsize * 1.2 * float_slot_num); + node_id_array[i].reserve(predsize * 1.2); + node_fea_info_array[i].reserve(predsize * 1.2); + } + + for (auto x : node_ids) { + int location = x % shard_num; + bags[location].push_back(x); + } + + std::vector> tasks; + + for (size_t i = 0; i < bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, this]() -> int { + uint64_t node_id; + paddle::framework::GpuPsFeaInfo x; + // std::vector feature_ids; + for (size_t j = 0; j < bags[i].size(); j++) { + Node *v = find_node(GraphTableType::FEATURE_TABLE, bags[i][j]); + node_id = bags[i][j]; + if (v == NULL) { + x.feature_size = 0; + x.feature_offset = 0; + node_fea_info_array[i].push_back(x); + } else { + // x <- v + x.feature_offset = feature_array[i].size(); + int total_feature_size = 0; + for (int k = 0; k < float_slot_num; ++k) { + auto float_feature_size = + v->get_float_feature(k, feature_array[i], slot_id_array[i]); + total_feature_size += float_feature_size; + } + x.feature_size = total_feature_size; + node_fea_info_array[i].push_back(x); + } + node_id_array[i].push_back(node_id); + } + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + tasks.clear(); + + paddle::framework::GpuPsCommGraphFloatFea res; + uint64_t tot_len = 0; + for (size_t i = 0; i < shard_num; i++) { + tot_len += feature_array[i].size(); + } + VLOG(1) << "Loaded float feature table on cpu, float feature_list_size[" << tot_len + << "] node_ids_size[" << node_ids.size() << "]"; + res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), float_slot_num); + unsigned int offset = 0, ind = 0; + for (size_t i = 0; i < shard_num; i++) { + tasks.push_back( + _cpu_worker_pool[gpu_id]->enqueue([&, i, ind, offset, this]() -> int { + auto start = ind; + for (size_t j = 0; j < node_id_array[i].size(); j++) { + res.node_list[start] = node_id_array[i][j]; + res.fea_info_list[start] = node_fea_info_array[i][j]; + res.fea_info_list[start++].feature_offset += offset; + } + for 
(size_t j = 0; j < feature_array[i].size(); j++) { + res.feature_list[offset + j] = feature_array[i][j]; + res.slot_id_list[offset + j] = slot_id_array[i][j]; + } + return 0; + })); + offset += feature_array[i].size(); + ind += node_id_array[i].size(); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return res; +} + paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( int idx, const std::vector &ids) { std::vector> bags(task_pool_size_); @@ -1231,16 +1321,19 @@ GraphNode *GraphShard::add_graph_node(Node *node) { return reinterpret_cast(bucket[node_location[id]]); } -FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap) { +FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap, int float_fea_num) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); - bucket.push_back(new FeatureNode(id)); + if (float_fea_num > 0) { + bucket.push_back(new FloatFeatureNode(id)); + } else { + bucket.push_back(new FeatureNode(id)); + } return reinterpret_cast(bucket[node_location[id]]); } if (is_overlap) { return reinterpret_cast(bucket[node_location[id]]); } - return NULL; } @@ -1851,10 +1944,15 @@ std::pair GraphTable::parse_node_file( local_count++; size_t index = shard_id - shard_start; + int slot_fea_num = 0; + if (feat_name.size() > 0) slot_fea_num = feat_name[idx].size(); + int float_fea_num = 0; + if (float_feat_id_map.size() > 0) float_fea_num = float_feat_id_map[idx].size(); if (load_slot) { - auto node = feature_shards[idx][index]->add_feature_node(id, false); + auto node = feature_shards[idx][index]->add_feature_node(id, false, float_fea_num); if (node != NULL) { - node->set_feature_size(feat_name[idx].size()); + if (slot_fea_num > 0) node->set_feature_size(slot_fea_num); + if (float_fea_num > 0) node->set_float_feature_size(float_fea_num); for (int i = 1; i < num; ++i) { auto &v = vals[i]; int ret = parse_feature(idx, v.ptr, v.len, node); @@ -1866,7 +1964,7 @@ std::pair GraphTable::parse_node_file( } } } else { - node_shards[idx][index]->add_feature_node(id, false); + node_shards[idx][index]->add_feature_node(id, false, float_fea_num); } local_valid_count++; } @@ -1920,8 +2018,10 @@ std::pair GraphTable::parse_node_file( } } size_t index = shard_id - shard_start; + int float_fea_num = 0; + if (float_feat_id_map.size() > 0) float_fea_num = float_feat_id_map[idx].size(); if (load_slot) { - auto node = feature_shards[idx][index]->add_feature_node(id, false); + auto node = feature_shards[idx][index]->add_feature_node(id, false, float_fea_num); if (node != NULL) { for (int i = 2; i < num; ++i) { auto &v = vals[i]; @@ -1934,7 +2034,7 @@ std::pair GraphTable::parse_node_file( } } } else { - node_shards[idx][index]->add_feature_node(id, false); + node_shards[idx][index]->add_feature_node(id, false, float_fea_num); } local_valid_count++; } @@ -2529,22 +2629,6 @@ int GraphTable::parse_feature(int idx, string_vector_2_string( fea_fields.begin(), fea_fields.end(), ' ', fea_ptr); return 0; - } else if (dtype == "float32") { - int ret = FeatureNode::parse_value_to_bytes( - fea_fields.begin(), fea_fields.end(), fea_ptr); - if (ret != 0) { - VLOG(0) << "Fail to parse value"; - return -1; - } - return 0; - } else if (dtype == "float64") { - int ret = FeatureNode::parse_value_to_bytes( - fea_fields.begin(), fea_fields.end(), fea_ptr); - if (ret != 0) { - VLOG(0) << "Fail to parse value"; - return -1; - } - return 0; } else if (dtype == "int32") { int ret = FeatureNode::parse_value_to_bytes( fea_fields.begin(), 
fea_fields.end(), fea_ptr); @@ -2563,10 +2647,36 @@ int GraphTable::parse_feature(int idx, return 0; } } else { - VLOG(10) << "feature_name[" << name << "] is not in feat_id_map, ntype_id[" - << idx << "] feat_id_map_size[" << feat_id_map.size() << "]"; + if (float_feat_id_map.size() > (size_t)idx) { + auto float_it = float_feat_id_map[idx].find(name); + if (float_it != float_feat_id_map[idx].end()) { + int32_t id = float_it->second; + std::string *fea_ptr = node->mutable_float_feature(id); + std::string dtype = this->float_feat_dtype[idx][id]; + if (dtype == "float32") { + int ret = FeatureNode::parse_value_to_bytes( + fea_fields.begin(), fea_fields.end(), fea_ptr); + if (ret != 0) { + VLOG(0) << "Fail to parse value"; + return -1; + } + return 0; + } + // else if (dtype == "float64") { // not used + // int ret = FeatureNode::parse_value_to_bytes( + // fea_fields.begin(), fea_fields.end(), fea_ptr); + // if (ret != 0) { + // VLOG(0) << "Fail to parse value"; + // return -1; + // } + // return 0; + // } + } else { + VLOG(4) << "feature_name[" << name << "] is not in feat_id_map, ntype_id[" + << idx << "] feat_id_map_size[" << feat_id_map.size() << "]"; + } + } } - return 0; } // thread safe shard vector merge @@ -2930,7 +3040,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { auto feature = graph_feature[k]; id_to_feature.push_back(node_type); int feat_conf_size = static_cast(feature.name().size()); - + int feasign_idx = 0, float_idx = 0; for (int i = 0; i < feat_conf_size; i++) { // auto &f_name = common.attributes()[i]; // auto &f_shape = common.dims()[i]; @@ -2938,10 +3048,24 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { auto &f_name = feature.name()[i]; auto &f_shape = feature.shape()[i]; auto &f_dtype = feature.dtype()[i]; - feat_name[k].push_back(f_name); - feat_shape[k].push_back(f_shape); - feat_dtype[k].push_back(f_dtype); - feat_id_map[k][f_name] = i; + if (f_dtype == "feasign" || f_dtype == "int64") { + feat_name[k].push_back(f_name); + feat_shape[k].push_back(f_shape); + feat_dtype[k].push_back(f_dtype); + feat_id_map[k][f_name] = feasign_idx++; + } + else if (f_dtype == "float32"){ + if (float_feat_id_map.size() < (size_t)node_types.size()) { + float_feat_name.resize(node_types.size()); + float_feat_shape.resize(node_types.size()); + float_feat_dtype.resize(node_types.size()); + float_feat_id_map.resize(node_types.size()); + } + float_feat_name[k].push_back(f_name); + float_feat_shape[k].push_back(f_shape); + float_feat_dtype[k].push_back(f_dtype); + float_feat_id_map[k][f_name] = float_idx++; + } VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 92083352b2f8e6..cd95b5300cb2f5 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -122,7 +122,7 @@ class GraphShard { } GraphNode *add_graph_node(uint64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true); + FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true, int float_fea_num = 0); Node *find_node(uint64_t id); void delete_node(uint64_t id); void clear(); @@ -725,6 +725,8 @@ class GraphTable : public Table { int idx, const std::vector &ids); virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( int gpu_id, std::vector &node_ids, int slot_num); 
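GraphTable::Initialize above now routes each configured feature into one of two id spaces keyed by dtype: "feasign"/"int64" slots keep using feat_id_map, while "float32" slots get ids in the new float_feat_id_map declared below. A minimal standalone sketch of that split, assuming made-up slot names and a plain std::map in place of the per-node-type unordered_map vectors used here:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Illustrative feature config: (name, dtype) pairs, as read in Initialize().
  std::vector<std::pair<std::string, std::string>> feat_conf = {
      {"side_info", "feasign"}, {"emb_d", "float32"},
      {"click_cnt", "int64"},   {"price", "float32"}};
  std::map<std::string, int> feat_id_map;        // feasign/int64 slots
  std::map<std::string, int> float_feat_id_map;  // float32 slots
  int feasign_idx = 0, float_idx = 0;
  for (const auto &kv : feat_conf) {
    if (kv.second == "feasign" || kv.second == "int64") {
      feat_id_map[kv.first] = feasign_idx++;      // side_info -> 0, click_cnt -> 1
    } else if (kv.second == "float32") {
      float_feat_id_map[kv.first] = float_idx++;  // emb_d -> 0, price -> 1
    }
  }
  std::cout << feat_id_map.size() << " feasign slots, "
            << float_feat_id_map.size() << " float slots\n";
  return 0;
}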
// NOLINT + virtual paddle::framework::GpuPsCommGraphFloatFea make_gpu_ps_graph_float_fea( + int gpu_id, std::vector &node_ids, int float_slot_num); // NOLINT int32_t Load_to_ssd(const std::string &path, const std::string ¶m); int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); // NOLINT @@ -779,7 +781,13 @@ class GraphTable : public Table { std::vector> feat_name; std::vector> feat_dtype; std::vector> feat_shape; + std::vector> float_feat_name; + std::vector> float_feat_dtype; + std::vector> float_feat_shape; + // int slot_fea_num_{-1}; + // int float_fea_num_{-1}; std::vector> feat_id_map; + std::vector> float_feat_id_map; std::unordered_map feature_to_id, edge_to_id; std::vector id_to_feature, id_to_edge; std::string table_name; diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index 3d6ee011f69c45..e52e36af6ddda4 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -62,6 +62,11 @@ class Node { std::vector &slot_id) const { // NOLINT return 0; } + virtual int get_float_feature(int slot_idx, + std::vector &feature_id, // NOLINT + std::vector &slot_id) const { // NOLINT + return 0; + } virtual void set_feature(int idx, const std::string &str) {} virtual void set_feature_size(int size) {} virtual void shrink_to_fit() {} @@ -188,6 +193,12 @@ class FeatureNode : public Node { return num; } + virtual int get_float_feature(int slot_idx, + std::vector &float_feature, // NOLINT + std::vector &slot_id) const { // NOLINT + return 0; + } + virtual std::string *mutable_feature(int idx) { if (idx >= static_cast(this->feature.size())) { this->feature.resize(idx + 1); @@ -195,14 +206,249 @@ class FeatureNode : public Node { return &(this->feature[idx]); } + virtual std::string *mutable_float_feature(int idx) { return NULL; } + virtual void set_feature(int idx, const std::string &str) { if (idx >= static_cast(this->feature.size())) { this->feature.resize(idx + 1); } this->feature[idx] = str; } - virtual void set_feature_size(int size) { this->feature.resize(size); } - virtual int get_feature_size() { return this->feature.size(); } + virtual void set_feature_size(int size) { + this->feature.resize(size); + } + virtual void set_float_feature_size(int size) {} + virtual int get_feature_size() { + return this->feature.size(); + } + virtual int get_float_feature_size() { return 0; } + virtual void shrink_to_fit() { + feature.shrink_to_fit(); + for (auto &slot : feature) { + slot.shrink_to_fit(); + } + } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy( + buffer + sizeof(T) * i, reinterpret_cast(&v), sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static void parse_value_to_bytes( + std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, + std::string *output) { + T v; + size_t feat_str_size = feat_str_end - feat_str_begin; + size_t Tsize = sizeof(T) * feat_str_size; + char buffer[Tsize] = {'\0'}; + for (size_t i = 0; i < feat_str_size; i++) { + std::stringstream ss(*(feat_str_begin + i)); + ss >> v; + std::memcpy( + buffer + sizeof(T) * i, reinterpret_cast(&v), sizeof(T)); + } + output->assign(buffer); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + 
std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy(reinterpret_cast(&v), buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + template + static int parse_value_to_bytes( + std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, + std::string *output) { + size_t feat_str_size = feat_str_end - feat_str_begin; + size_t Tsize = sizeof(T) * feat_str_size; + size_t num = output->length(); + output->resize(num + Tsize); + + T *fea_ptrs = reinterpret_cast(&(*output)[num]); + + thread_local paddle::string::str_ptr_stream ss; + for (size_t i = 0; i < feat_str_size; i++) { + ss.reset(*(feat_str_begin + i)); + int len = ss.end - ss.ptr; + char *old_ptr = ss.ptr; + ss >> fea_ptrs[i]; + if (ss.ptr - old_ptr != len) { + return -1; + } + } + return 0; + } + + protected: + std::vector feature; +}; + +class FloatFeatureNode : public FeatureNode { + public: + FloatFeatureNode() : FeatureNode() {} + explicit FloatFeatureNode(uint64_t id) : FeatureNode(id) {} + virtual ~FloatFeatureNode() {} + virtual std::string get_feature(int idx) { + if (idx < static_cast(float_feature_start_idx)) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual int get_feature_ids(std::vector *res) const { + PADDLE_ENFORCE_NOT_NULL(res, + paddle::platform::errors::InvalidArgument( + "get_feature_ids res should not be null")); + errno = 0; + for (int slot_idx = 0; slot_idx < float_feature_start_idx; slot_idx++) { + auto& feature_item = this->feature[slot_idx]; + // for (auto &feature_item : feature) { + const uint64_t *feas = (const uint64_t *)(feature_item.c_str()); + size_t num = feature_item.length() / sizeof(uint64_t); + CHECK((feature_item.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << feature_item << "]"; + size_t n = res->size(); + res->resize(n + num); + for (size_t i = 0; i < num; ++i) { + (*res)[n + i] = feas[i]; + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return 0; + } + + virtual int get_feature_ids(int slot_idx, std::vector *res) const { + PADDLE_ENFORCE_NOT_NULL(res, + paddle::platform::errors::InvalidArgument( + "get_feature_ids res should not be null")); + res->clear(); + errno = 0; + if (slot_idx < static_cast(float_feature_start_idx)) { + const std::string &s = this->feature[slot_idx]; + const uint64_t *feas = (const uint64_t *)(s.c_str()); + + size_t num = s.length() / sizeof(uint64_t); + CHECK((s.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << s << "]"; + res->resize(num); + for (size_t i = 0; i < num; ++i) { + (*res)[i] = feas[i]; + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return 0; + } + + virtual int get_feature_ids(int slot_idx, + std::vector &feature_id, // NOLINT + std::vector &slot_id) const { // NOLINT + errno = 0; + size_t num = 0; + if (slot_idx < static_cast(float_feature_start_idx)) { + const std::string &s = this->feature[slot_idx]; + const uint64_t *feas = (const uint64_t *)(s.c_str()); + num = s.length() / sizeof(uint64_t); + CHECK((s.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << s << "]"; + for (size_t i = 0; i < num; ++i) { + feature_id.push_back(feas[i]); + slot_id.push_back(slot_idx); + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + 
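Float slots reuse the byte-packing scheme of the existing uint64 slots: parse_value_to_bytes<float> memcpy's the parsed values back to back into a std::string, and get_float_feature later reinterprets that buffer as a float array. A minimal standalone round-trip sketch of that layout, using plain STL types rather than the FeatureNode helpers themselves:

#include <cassert>
#include <cstring>
#include <string>
#include <vector>

int main() {
  std::vector<float> values = {0.5f, 1.25f, -3.0f};

  // Pack: one contiguous byte buffer, sizeof(float) bytes per value.
  std::string packed(values.size() * sizeof(float), '\0');
  std::memcpy(&packed[0], values.data(), packed.size());

  // Unpack: reinterpret the bytes; the length must be a multiple of sizeof(float).
  assert(packed.size() % sizeof(float) == 0);
  const float *feas = reinterpret_cast<const float *>(packed.data());
  size_t num = packed.size() / sizeof(float);
  for (size_t i = 0; i < num; ++i) {
    assert(feas[i] == values[i]);  // exact, since the bytes are copied verbatim
  }
  return 0;
}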
paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return num; + } + + virtual int get_float_feature(int slot_idx, + std::vector &float_feature, // NOLINT + std::vector &slot_id) const { // NOLINT + errno = 0; + size_t num = 0; + if (float_feature_start_idx + slot_idx < static_cast(this->feature.size())) { + const std::string &s = this->feature[float_feature_start_idx + slot_idx]; + const float *feas = (const float *)(s.c_str()); + num = s.length() / sizeof(float); + CHECK((s.length() % sizeof(float)) == 0) + << "bad feature_item: [" << s << "]"; + for (size_t i = 0; i < num; ++i) { + float_feature.push_back(feas[i]); + slot_id.push_back(slot_idx); + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return num; + } + + virtual std::string *mutable_feature(int idx) { + if (idx >= static_cast(this->feature.size())) { + this->feature.resize(idx + 1); + } + if (idx + 1 > float_feature_start_idx) float_feature_start_idx = idx + 1; + return &(this->feature[idx]); + } + + virtual std::string *mutable_float_feature(int idx) { + if (float_feature_start_idx + idx >= static_cast(this->feature.size())) { + this->feature.resize(float_feature_start_idx + idx + 1); + } + return &(this->feature[float_feature_start_idx + idx]); + } + + virtual void set_feature(int idx, const std::string &str) { + if (idx >= static_cast(this->feature.size())) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { + this->feature.resize(size); + float_feature_start_idx = size; + } + virtual void set_float_feature_size(int size) { this->feature.resize(float_feature_start_idx + size); } + virtual int get_feature_size() { + return float_feature_start_idx; + } + virtual int get_float_feature_size() { + return this->feature.size() - float_feature_start_idx; + } virtual void shrink_to_fit() { feature.shrink_to_fit(); for (auto &slot : feature) { @@ -283,6 +529,7 @@ class FeatureNode : public Node { protected: std::vector feature; + uint8_t float_feature_start_idx = 0; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 975067d300ecb5..383e25224299d5 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -1050,18 +1050,26 @@ int32_t MemorySparseTable::Flush() { return 0; } int32_t MemorySparseTable::Shrink(const std::string ¶m) { VLOG(0) << "MemorySparseTable::Shrink"; - // TODO(zhaocaibei123): implement with multi-thread + std::atomic shrink_size_all{0}; + int thread_num = _real_local_shard_num; + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { // Shrink + int feasign_size = 0; auto &shard = _local_shards[shard_id]; for (auto it = shard.begin(); it != shard.end();) { if (_value_accesor->Shrink(it.value().data())) { it = shard.erase(it); + ++feasign_size; } else { ++it; } } + shrink_size_all += feasign_size; } + VLOG(0) << "MemorySparseTable::Shrink success, shrink size:" + << shrink_size_all; return 0; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 9e7ff1612b360d..75994da3a7238b 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ 
-2670,6 +2670,8 @@ bool SlotRecordInMemoryDataFeed::Start() { #endif #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetFeedVec(feed_vec_); + // adapt for dense feature + gpu_graph_data_generator_.SetFeedInfo(&used_slots_info_); #endif return true; } diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index ffc96dc79f12f8..5b0c2bfe864d97 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -31,9 +31,9 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" #include "paddle/phi/kernels/graph_reindex_kernel.h" -#include "paddle/fluid/platform/collective_helper.h" DECLARE_bool(enable_opt_get_features); DECLARE_bool(graph_metapath_split_opt); @@ -343,7 +343,7 @@ __global__ void FillSlotValueOffsetKernel(const int ins_num, struct RandInt { int low, high; - __host__ __device__ RandInt(int low, int high) : low(low), high(high){}; + __host__ __device__ RandInt(int low, int high) : low(low), high(high) {} __host__ __device__ int operator()(const unsigned int n) const { thrust::default_random_engine rng; @@ -635,191 +635,190 @@ __global__ void FillActualNeighbors(int64_t *vals, int GraphDataGenerator::FillIdShowClkTensor(int total_instance, bool gpu_graph_training) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - if (gpu_graph_training) { - uint64_t *ins_cursor, *ins_buf; - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - cudaMemcpyAsync(id_tensor_ptr_, - ins_cursor, - sizeof(uint64_t) * total_instance, - cudaMemcpyDeviceToDevice, - train_stream_); - - if (conf_.enable_pair_label) { - pair_label_ptr_ = feed_vec_[3]->mutable_data( - {total_instance / 2}, this->place_); - int32_t *pair_label_buf = - reinterpret_cast(d_pair_label_buf_->ptr()); - int32_t *pair_label_cursor = - pair_label_buf + ins_buf_pair_len_ - total_instance / 2; - cudaMemcpyAsync(pair_label_ptr_, - pair_label_cursor, - sizeof(int32_t) * total_instance / 2, - cudaMemcpyDeviceToDevice, - train_stream_); - } - } else { - // infer - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[infer_cursor_]->ptr()); - d_type_keys += infer_node_start_; - infer_node_start_ += total_instance / 2; - CopyDuplicateKeys<<>>( - id_tensor_ptr_, d_type_keys, total_instance / 2); - } - + feed_vec_[0]->mutable_data({total_instance}, this->place_); GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + clk_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + id_tensor_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {total_instance, 1}, this->place_); + if (gpu_graph_training) { + uint64_t *ins_buf = reinterpret_cast(d_ins_buf_[tensor_pair_idx]->ptr()); + uint64_t *ins_cursor = ins_buf + ins_buf_pair_len_[tensor_pair_idx] * 2 - total_instance; + 
cudaMemcpyAsync(id_tensor_ptr_, + ins_cursor, + sizeof(uint64_t) * total_instance, + cudaMemcpyDeviceToDevice, + train_stream_); + + if (conf_.enable_pair_label) { + pair_label_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {total_instance / 2}, this->place_); + int32_t *pair_label_buf = + reinterpret_cast(d_pair_label_buf_[tensor_pair_idx]->ptr()); + int32_t *pair_label_cursor = + pair_label_buf + ins_buf_pair_len_[tensor_pair_idx] - total_instance / 2; + cudaMemcpyAsync(pair_label_ptr_, + pair_label_cursor, + sizeof(int32_t) * total_instance / 2, + cudaMemcpyDeviceToDevice, + train_stream_); + } + } else { // infer + uint64_t *d_type_keys = reinterpret_cast( + d_device_keys_[tensor_pair_idx][infer_cursor_[tensor_pair_idx]]->ptr()); + d_type_keys += infer_node_start_[tensor_pair_idx]; + infer_node_start_[tensor_pair_idx] += total_instance / 2; + CopyDuplicateKeys<<>>( + id_tensor_ptr_, d_type_keys, total_instance / 2); + } + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + return 0; } int GraphDataGenerator::FillGraphIdShowClkTensor(int uniq_instance, int total_instance, int index) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({uniq_instance, 1}, this->place_); show_tensor_ptr_ = - feed_vec_[1]->mutable_data({uniq_instance}, this->place_); + feed_vec_[0]->mutable_data({uniq_instance}, this->place_); + GraphFillCVMKernel<<>>(show_tensor_ptr_, uniq_instance); clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({uniq_instance}, this->place_); - int index_offset = 0; - if (conf_.enable_pair_label) { - pair_label_ptr_ = - feed_vec_[3]->mutable_data({total_instance / 2}, this->place_); - int32_t *pair_label_buf = - reinterpret_cast(d_pair_label_buf_->ptr()); - int32_t *pair_label_cursor = - pair_label_buf + ins_buf_pair_len_ - total_instance / 2; - cudaMemcpyAsync(pair_label_ptr_, - pair_label_vec_[index]->ptr(), - sizeof(int32_t) * total_instance / 2, + feed_vec_[1]->mutable_data({uniq_instance}, this->place_); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, uniq_instance); + + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + id_tensor_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {uniq_instance, 1}, this->place_); + cudaMemcpyAsync(id_tensor_ptr_, + final_sage_nodes_vec_[index]->ptr(), + sizeof(int64_t) * uniq_instance, cudaMemcpyDeviceToDevice, train_stream_); - } - if (!conf_.return_weight) { - index_offset = - id_offset_of_feed_vec_ + conf_.slot_num * 2 + 5 * samples_.size(); - } else { - index_offset = id_offset_of_feed_vec_ + conf_.slot_num * 2 + - 6 * samples_.size(); // add edge weights - } - index_tensor_ptr_ = feed_vec_[index_offset]->mutable_data( - {total_instance}, this->place_); - if (conf_.get_degree) { - degree_tensor_ptr_ = feed_vec_[index_offset + 1]->mutable_data( - {uniq_instance * edge_to_id_len_}, this->place_); - } - - int len_samples = samples_.size(); - int *num_nodes_tensor_ptr_[len_samples]; - int *next_num_nodes_tensor_ptr_[len_samples]; - int64_t *edges_src_tensor_ptr_[len_samples]; - int64_t *edges_dst_tensor_ptr_[len_samples]; - int *edges_split_tensor_ptr_[len_samples]; - float *edges_weight_tensor_ptr_[len_samples]; - - std::vector> edges_split_num_for_graph = - edges_split_num_vec_[index]; - std::vector> graph_edges = - graph_edges_vec_[index]; - int graph_edges_index = 0; - for (int i = 0; i < len_samples; i++) { - int offset = conf_.return_weight - ? 
(id_offset_of_feed_vec_ + 2 * conf_.slot_num + 6 * i) - : (id_offset_of_feed_vec_ + 2 * conf_.slot_num + 5 * i); - std::vector edges_split_num = edges_split_num_for_graph[i]; - - int neighbor_len = edges_split_num[edge_to_id_len_ + 2]; - num_nodes_tensor_ptr_[i] = - feed_vec_[offset]->mutable_data({1}, this->place_); - next_num_nodes_tensor_ptr_[i] = - feed_vec_[offset + 1]->mutable_data({1}, this->place_); - edges_src_tensor_ptr_[i] = feed_vec_[offset + 2]->mutable_data( - {neighbor_len, 1}, this->place_); - edges_dst_tensor_ptr_[i] = feed_vec_[offset + 3]->mutable_data( - {neighbor_len, 1}, this->place_); - edges_split_tensor_ptr_[i] = feed_vec_[offset + 4]->mutable_data( - {edge_to_id_len_}, this->place_); - if (conf_.return_weight) { - edges_weight_tensor_ptr_[i] = feed_vec_[offset + 5]->mutable_data( - {neighbor_len, 1}, this->place_); + + if (conf_.enable_pair_label) { + pair_label_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {total_instance / 2}, this->place_); + int32_t *pair_label_buf = + reinterpret_cast(d_pair_label_buf_[tensor_pair_idx]->ptr()); + int32_t *pair_label_cursor = + pair_label_buf + ins_buf_pair_len_[tensor_pair_idx] - total_instance / 2; + cudaMemcpyAsync(pair_label_ptr_, + pair_label_vec_[index]->ptr(), + sizeof(int32_t) * total_instance / 2, + cudaMemcpyDeviceToDevice, + train_stream_); } - // [edges_split_num, next_num_nodes, num_nodes, neighbor_len] - cudaMemcpyAsync(next_num_nodes_tensor_ptr_[i], - edges_split_num.data() + edge_to_id_len_, - sizeof(int), - cudaMemcpyHostToDevice, - train_stream_); - cudaMemcpyAsync(num_nodes_tensor_ptr_[i], - edges_split_num.data() + edge_to_id_len_ + 1, - sizeof(int), - cudaMemcpyHostToDevice, - train_stream_); - cudaMemcpyAsync(edges_split_tensor_ptr_[i], - edges_split_num.data(), - sizeof(int) * edge_to_id_len_, - cudaMemcpyHostToDevice, - train_stream_); - cudaMemcpyAsync(edges_src_tensor_ptr_[i], - graph_edges[graph_edges_index++]->ptr(), - sizeof(int64_t) * neighbor_len, - cudaMemcpyDeviceToDevice, - train_stream_); - cudaMemcpyAsync(edges_dst_tensor_ptr_[i], - graph_edges[graph_edges_index++]->ptr(), - sizeof(int64_t) * neighbor_len, - cudaMemcpyDeviceToDevice, - train_stream_); - if (conf_.return_weight) { - cudaMemcpyAsync(edges_weight_tensor_ptr_[i], + feed_vec_idx += conf_.slot_num * 2; + + int len_samples = conf_.samples.size(); + int *num_nodes_tensor_ptr_[len_samples]; + int *next_num_nodes_tensor_ptr_[len_samples]; + int64_t *edges_src_tensor_ptr_[len_samples]; + int64_t *edges_dst_tensor_ptr_[len_samples]; + int *edges_split_tensor_ptr_[len_samples]; + float *edges_weight_tensor_ptr_[len_samples]; + std::vector> edges_split_num_for_graph = + edges_split_num_vec_[index]; + std::vector> graph_edges = + graph_edges_vec_[index]; + int graph_edges_index = 0; + for (int i = 0; i < len_samples; i++) { + std::vector edges_split_num = edges_split_num_for_graph[i]; + + int neighbor_len = edges_split_num[conf_.edge_to_id_len + 2]; + num_nodes_tensor_ptr_[i] = + feed_vec_[feed_vec_idx++]->mutable_data({1}, this->place_); + next_num_nodes_tensor_ptr_[i] = + feed_vec_[feed_vec_idx++]->mutable_data({1}, this->place_); + edges_src_tensor_ptr_[i] = feed_vec_[feed_vec_idx++]->mutable_data( + {neighbor_len, 1}, this->place_); + edges_dst_tensor_ptr_[i] = feed_vec_[feed_vec_idx++]->mutable_data( + {neighbor_len, 1}, this->place_); + edges_split_tensor_ptr_[i] = feed_vec_[feed_vec_idx++]->mutable_data( + {conf_.edge_to_id_len}, this->place_); + if (conf_.return_weight) { + edges_weight_tensor_ptr_[i] = 
feed_vec_[feed_vec_idx++]->mutable_data( + {neighbor_len, 1}, this->place_); + } + + // [edges_split_num, next_num_nodes, num_nodes, neighbor_len] + cudaMemcpyAsync(next_num_nodes_tensor_ptr_[i], + edges_split_num.data() + conf_.edge_to_id_len, + sizeof(int), + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(num_nodes_tensor_ptr_[i], + edges_split_num.data() + conf_.edge_to_id_len + 1, + sizeof(int), + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(edges_split_tensor_ptr_[i], + edges_split_num.data(), + sizeof(int) * conf_.edge_to_id_len, + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(edges_src_tensor_ptr_[i], graph_edges[graph_edges_index++]->ptr(), - sizeof(float) * neighbor_len, + sizeof(int64_t) * neighbor_len, cudaMemcpyDeviceToDevice, train_stream_); - } - } + cudaMemcpyAsync(edges_dst_tensor_ptr_[i], + graph_edges[graph_edges_index++]->ptr(), + sizeof(int64_t) * neighbor_len, + cudaMemcpyDeviceToDevice, + train_stream_); + if (conf_.return_weight) { + cudaMemcpyAsync(edges_weight_tensor_ptr_[i], + graph_edges[graph_edges_index++]->ptr(), + sizeof(float) * neighbor_len, + cudaMemcpyDeviceToDevice, + train_stream_); + } + } // end for (int i = 0; i < len_samples; i++) { - cudaMemcpyAsync(id_tensor_ptr_, - final_sage_nodes_vec_[index]->ptr(), - sizeof(int64_t) * uniq_instance, - cudaMemcpyDeviceToDevice, - train_stream_); - cudaMemcpyAsync(index_tensor_ptr_, - inverse_vec_[index]->ptr(), - sizeof(int) * total_instance, - cudaMemcpyDeviceToDevice, - train_stream_); - if (conf_.get_degree) { - cudaMemcpyAsync(degree_tensor_ptr_, - node_degree_vec_[index]->ptr(), - sizeof(int) * uniq_instance * edge_to_id_len_, + index_tensor_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {total_instance}, this->place_); + cudaMemcpyAsync(index_tensor_ptr_, + inverse_vec_[index]->ptr(), + sizeof(int) * total_instance, cudaMemcpyDeviceToDevice, train_stream_); - } - GraphFillCVMKernel<<>>(show_tensor_ptr_, uniq_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, uniq_instance); + + if (conf_.get_degree) { + degree_tensor_ptr_ = feed_vec_[feed_vec_idx++]->mutable_data( + {uniq_instance * conf_.edge_to_id_len}, this->place_); + cudaMemcpyAsync(degree_tensor_ptr_, + node_degree_vec_[index]->ptr(), + sizeof(int) * uniq_instance * conf_.edge_to_id_len, + cudaMemcpyDeviceToDevice, + train_stream_); + } + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + return 0; } @@ -827,23 +826,40 @@ int GraphDataGenerator::FillGraphSlotFeature( int total_instance, bool gpu_graph_training, std::shared_ptr final_sage_nodes) { + int ret = 0; uint64_t *ins_cursor, *ins_buf; - if (gpu_graph_training) { - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - } else { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - ins_cursor = reinterpret_cast(id_tensor_ptr_); - } - if (!conf_.sage_mode) { - return FillSlotFeature(ins_cursor, total_instance); - } else { - uint64_t *sage_nodes_ptr = - reinterpret_cast(final_sage_nodes->ptr()); - return FillSlotFeature(sage_nodes_ptr, total_instance); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + if (gpu_graph_training) { + ins_buf = reinterpret_cast(d_ins_buf_[tensor_pair_idx]->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_[tensor_pair_idx] * 2 - total_instance; + } else { + id_tensor_ptr_ = + 
feed_vec_[feed_vec_idx]->mutable_data({total_instance, 1}, this->place_); + ins_cursor = reinterpret_cast(id_tensor_ptr_); + } + if (!conf_.sage_mode) { + if (uint_slot_num_ > 0) { + ret += FillSlotFeature(ins_cursor, total_instance, tensor_pair_idx); + } + if (float_slot_num_ > 0) { + ret += FillFloatFeature(ins_cursor, total_instance, tensor_pair_idx); + } + } else { + uint64_t *sage_nodes_ptr = + reinterpret_cast(final_sage_nodes->ptr()); + if (uint_slot_num_ > 0) { + ret += FillSlotFeature(sage_nodes_ptr, total_instance, tensor_pair_idx); + } + if (float_slot_num_ > 0) { + ret += FillFloatFeature(sage_nodes_ptr, total_instance, tensor_pair_idx); + } + } } + + return ret; } int MakeInsPair(const std::shared_ptr &d_walk, // input @@ -851,36 +867,33 @@ int MakeInsPair(const std::shared_ptr &d_walk, // input const GraphDataGeneratorConfig &conf, const std::shared_ptr &d_random_row, const std::shared_ptr &d_random_row_col_shift, - BufState &buf_state, - std::shared_ptr &d_ins_buf, // output - std::shared_ptr &d_pair_label_buf, // output - std::shared_ptr &d_pair_num_ptr, // output - int &ins_buf_pair_len, + BufState *buf_state, + uint64_t *ins_buf, // output + int32_t *pair_label_buf, // output + int *d_pair_num, // output + int *ins_buf_pair_len_ptr, cudaStream_t stream) { uint64_t *walk = reinterpret_cast(d_walk->ptr()); uint8_t *walk_ntype = NULL; - uint8_t *excluded_train_pair = NULL; if (conf.need_walk_ntype) { walk_ntype = reinterpret_cast(d_walk_ntype->ptr()); } + uint8_t *excluded_train_pair = NULL; if (conf.excluded_train_pair_len > 0) { excluded_train_pair = reinterpret_cast(conf.d_excluded_train_pair->ptr()); } - uint64_t *ins_buf = reinterpret_cast(d_ins_buf->ptr()); - int32_t *pair_label_buf = NULL; int32_t *pair_label_conf = NULL; if (conf.enable_pair_label) { - pair_label_buf = reinterpret_cast(d_pair_label_buf->ptr()); pair_label_conf = reinterpret_cast(conf.d_pair_label_conf->ptr()); } int *random_row = reinterpret_cast(d_random_row->ptr()); int *random_row_col_shift = reinterpret_cast(d_random_row_col_shift->ptr()); - int *d_pair_num = reinterpret_cast(d_pair_num_ptr->ptr()); cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream); - int len = buf_state.len; + int len = buf_state->len; + int &ins_buf_pair_len = *ins_buf_pair_len_ptr; // make pair GraphFillIdKernel<<>>( @@ -889,10 +902,10 @@ int MakeInsPair(const std::shared_ptr &d_walk, // input d_pair_num, walk, walk_ntype, - random_row + buf_state.cursor, - random_row_col_shift + buf_state.cursor, - buf_state.central_word, - conf.window_step[buf_state.step], + random_row + buf_state->cursor, + random_row_col_shift + buf_state->cursor, + buf_state->central_word, + conf.window_step[buf_state->step], len, conf.walk_len, excluded_train_pair, @@ -926,19 +939,20 @@ int FillInsBuf(const std::shared_ptr &d_walk, // input const GraphDataGeneratorConfig &conf, const std::shared_ptr &d_random_row, const std::shared_ptr &d_random_row_col_shift, - BufState &buf_state, - std::shared_ptr &d_ins_buf, // output - std::shared_ptr &d_pair_label_buf, // output - std::shared_ptr &d_pair_num, // output - int &ins_buf_pair_len, + BufState *buf_state, + uint64_t *ins_buf, // output + int32_t *pair_label_buf, // output + int *pair_num_ptr, // output + int *ins_buf_pair_len_ptr, cudaStream_t stream) { + int &ins_buf_pair_len = *ins_buf_pair_len_ptr; if (ins_buf_pair_len >= conf.batch_size) { return conf.batch_size; } - int total_instance = AcquireInstance(&buf_state); + int total_instance = AcquireInstance(buf_state); VLOG(2) << "total_ins: " << 
total_instance; - buf_state.Debug(); + buf_state->Debug(); if (total_instance == 0) { return -1; @@ -949,25 +963,27 @@ int FillInsBuf(const std::shared_ptr &d_walk, // input d_random_row, d_random_row_col_shift, buf_state, - d_ins_buf, - d_pair_label_buf, - d_pair_num, - ins_buf_pair_len, + ins_buf, + pair_label_buf, + pair_num_ptr, + ins_buf_pair_len_ptr, stream); } int GraphDataGenerator::GenerateBatch() { - int total_instance = 0; + int total_instance = conf_.batch_size; platform::CUDADeviceGuard guard(conf_.gpuid); int res = 0; if (!conf_.gpu_graph_training) { // infer if (!conf_.sage_mode) { - total_instance = (infer_node_start_ + conf_.batch_size <= infer_node_end_) - ? conf_.batch_size - : infer_node_end_ - infer_node_start_; - VLOG(2) << "in graph_data generator:batch_size = " << conf_.batch_size - << " instance = " << total_instance; + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int size = infer_node_end_[tensor_pair_idx] - infer_node_start_[tensor_pair_idx]; + if (size < total_instance) { + total_instance = size; + } + } total_instance *= 2; if (total_instance == 0) { return 0; @@ -981,45 +997,54 @@ int GraphDataGenerator::GenerateBatch() { total_instance_vec_[sage_batch_count_], sage_batch_count_); } - } else { - // train + } else { // train if (!conf_.sage_mode) { - while (ins_buf_pair_len_ < conf_.batch_size) { - res = FillInsBuf(d_walk_, - d_walk_ntype_, - conf_, - d_random_row_, - d_random_row_col_shift_, - buf_state_, - d_ins_buf_, - d_pair_label_buf_, - d_pair_num_, - ins_buf_pair_len_, - train_stream_); - if (res == -1) { - if (ins_buf_pair_len_ == 0) { - if (is_multi_node_) { - pass_end_ = 1; - if (total_row_ != 0) { - buf_state_.Reset(total_row_); - VLOG(1) - << "reset buf state to make batch num equal in multi node"; + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + while (ins_buf_pair_len_[tensor_pair_idx] < conf_.batch_size) { + int32_t *pair_label_buf = NULL; + if (conf_.enable_pair_label) { + pair_label_buf = + reinterpret_cast(d_pair_label_buf_[tensor_pair_idx]->ptr()); + } + res = FillInsBuf(d_walk_[tensor_pair_idx], + d_walk_ntype_[tensor_pair_idx], + conf_, + d_random_row_[tensor_pair_idx], + d_random_row_col_shift_[tensor_pair_idx], + &buf_state_[tensor_pair_idx], + reinterpret_cast(d_ins_buf_[tensor_pair_idx]->ptr()), + pair_label_buf, + reinterpret_cast(d_pair_num_[tensor_pair_idx]->ptr()), + &ins_buf_pair_len_[tensor_pair_idx], + train_stream_); + if (res == -1) { + if (ins_buf_pair_len_[tensor_pair_idx] == 0) { + if (conf_.is_multi_node) { + pass_end_ = 1; + if (total_row_[tensor_pair_idx] != 0) { + buf_state_[tensor_pair_idx].Reset(total_row_[tensor_pair_idx]); + VLOG(1) + << "reset buf state to make batch num equal in multi node"; + } + } else { + return 0; } } else { - return 0; + break; } - } else { - break; } - } - } - total_instance = ins_buf_pair_len_ < conf_.batch_size ? ins_buf_pair_len_ + } // end while (ins_buf_pair_len_ < conf_.batch_size) + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + + int min_ins_buf_pair_len = *std::min_element(ins_buf_pair_len_.begin(), ins_buf_pair_len_.end()); + total_instance = min_ins_buf_pair_len < conf_.batch_size ? 
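For reference, the feed_vec_ layout assumed by the indexing in FillIdShowClkTensor, FillGraphIdShowClkTensor and GenerateBatch appears to be the following (field names are descriptive only):

// feed_vec_[0]: show, feed_vec_[1]: clk                (shared by all pairs)
// base = 2 + p * conf_.tensor_num_of_one_pair          (start of tensor pair p)
//   base + 0                : node id tensor
//   base + 1                : pair label (only when enable_pair_label is set)
//   next conf_.slot_num * 2 : slot i values at +2*i, slot i lod at +2*i+1;
//                             feed_info type[0] == 'u' marks uint64 slots,
//                             'f' marks float slots (GenerateBatch's set_lod
//                             loop touches only the 'u' ones)
//   sage mode, per sample   : num_nodes, next_num_nodes, edges_src,
//                             edges_dst, edges_split, [edges_weight]
//   then                    : inverse index tensor, [node degree tensor]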
min_ins_buf_pair_len : conf_.batch_size; total_instance *= 2; VLOG(2) << "total_instance: " << total_instance - << ", ins_buf_pair_len = " << ins_buf_pair_len_; + << ", ins_buf_pair_len = " << min_ins_buf_pair_len; FillIdShowClkTensor(total_instance, conf_.gpu_graph_training); - } else { + } else { // sage if (sage_batch_count_ == sage_batch_num_) { return 0; } @@ -1047,33 +1072,39 @@ int GraphDataGenerator::GenerateBatch() { sage_batch_count_ += 1; } LoD lod{offset_}; - feed_vec_[0]->set_lod(lod); - if (conf_.slot_num > 0) { - for (int i = 0; i < conf_.slot_num; ++i) { - feed_vec_[id_offset_of_feed_vec_ + 2 * i]->set_lod(lod); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + feed_vec_[feed_vec_idx++]->set_lod(lod); + if (conf_.enable_pair_label) { + feed_vec_idx++; + } + //adapt for float feature + if (conf_.slot_num > 0) { + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + feed_vec_[feed_vec_idx + 2 * i]->set_lod(lod); + } + } } } cudaStreamSynchronize(train_stream_); if (!conf_.gpu_graph_training) return 1; if (!conf_.sage_mode) { - ins_buf_pair_len_ -= total_instance / 2; + ins_buf_pair_len_[0] -= total_instance / 2; } return 1; } -__global__ void GraphFillSampleKeysKernel(uint64_t *neighbors, - uint64_t *sample_keys, - int *prefix_sum, +__global__ void GraphFillSampleKeysKernel(int *prefix_sum, int *sampleidx2row, int *tmp_sampleidx2row, int *actual_sample_size, - int cur_degree, int len) { CUDA_KERNEL_LOOP(idx, len) { for (int k = 0; k < actual_sample_size[idx]; k++) { size_t offset = prefix_sum[idx] + k; - sample_keys[offset] = neighbors[idx * cur_degree + k]; tmp_sampleidx2row[offset] = sampleidx2row[idx] + k; } } @@ -1117,12 +1148,10 @@ __global__ void GraphFillFirstStepKernel(int *prefix_sum, int walk_degree, int col_size, int *actual_sample_size, - uint64_t *neighbors, - uint64_t *sample_keys) { + uint64_t *neighbors) { CUDA_KERNEL_LOOP(idx, len) { for (int k = 0; k < actual_sample_size[idx]; k++) { size_t row = prefix_sum[idx] + k; - sample_keys[row] = neighbors[idx * walk_degree + k]; sampleidx2row[row] = row; size_t offset = col_size * row; @@ -1159,6 +1188,29 @@ __global__ void get_each_ins_info(uint8_t *slot_list, } } +__global__ void get_each_ins_float_info(uint8_t *slot_list, + uint32_t *slot_size_list, + uint32_t *slot_size_prefix, + uint32_t *each_ins_slot_num, + uint32_t *each_ins_slot_num_inner_prefix, + size_t key_num, + int slot_num) { // offset of float slot + const size_t i = blockIdx.x * blockDim.y + threadIdx.y; + if (i < key_num) { + uint32_t slot_index = slot_size_prefix[i]; + size_t each_ins_slot_index = i * slot_num; + for (int j = 0; j < slot_size_list[i]; j++) { + each_ins_slot_num[each_ins_slot_index + slot_list[slot_index + j]] += 1; + } + each_ins_slot_num_inner_prefix[each_ins_slot_index] = 0; + for (int j = 1; j < slot_num; j++) { + each_ins_slot_num_inner_prefix[each_ins_slot_index + j] = + each_ins_slot_num[each_ins_slot_index + j - 1] + + each_ins_slot_num_inner_prefix[each_ins_slot_index + j - 1]; + } + } +} + __global__ void fill_slot_num(uint32_t *d_each_ins_slot_num_ptr, uint64_t **d_ins_slot_num_vector_ptr, size_t key_num, @@ -1172,7 +1224,7 @@ __global__ void fill_slot_num(uint32_t *d_each_ins_slot_num_ptr, } } } - +// 可以搞成模板 __global__ void fill_slot_tensor(uint64_t *feature_list, uint32_t *feature_size_prefixsum, uint32_t *each_ins_slot_num_inner_prefix, @@ -1193,6 
+1245,26 @@ __global__ void fill_slot_tensor(uint64_t *feature_list, } } +__global__ void fill_float_tensor(float *feature_list, + uint32_t *feature_size_prefixsum, + uint32_t *each_ins_slot_num_inner_prefix, + uint64_t *ins_slot_num, + int64_t *slot_lod_tensor, + float *slot_tensor, + int slot, + int slot_num, + size_t node_num) { + const size_t i = blockIdx.x * blockDim.y + threadIdx.y; + if (i < node_num) { + size_t dst_index = slot_lod_tensor[i]; + size_t src_index = feature_size_prefixsum[i] + + each_ins_slot_num_inner_prefix[slot_num * i + slot]; + for (uint64_t j = 0; j < ins_slot_num[i]; j++) { + slot_tensor[dst_index + j] = feature_list[src_index + j]; + } + } +} + __global__ void GetUniqueFeaNum(uint64_t *d_in, uint64_t *unique_num, size_t len) { @@ -1260,14 +1332,12 @@ void FillOneStep( uint64_t *walk, uint8_t *walk_ntype, int len, - NeighborSampleResult &sample_res, + NeighborSampleResult *sample_res, int cur_degree, int step, const GraphDataGeneratorConfig &conf, - std::shared_ptr &d_sample_keys_ptr, - std::shared_ptr &d_prefix_sum_ptr, - std::vector> &d_sampleidx2rows, - int &cur_sampleidx2row, + std::vector> *d_sampleidx2rows, + int *cur_sampleidx2row, const paddle::platform::Place &place, cudaStream_t stream) { auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); @@ -1275,15 +1345,19 @@ void FillOneStep( uint8_t edge_src_id = node_id >> 32; uint8_t edge_dst_id = node_id; size_t temp_storage_bytes = 0; - int *d_actual_sample_size = sample_res.actual_sample_size; - uint64_t *d_neighbors = sample_res.val; + int *d_actual_sample_size = sample_res->actual_sample_size; + uint64_t *d_neighbors = sample_res->val; + auto d_prefix_sum_ptr = + memory::AllocShared(place, + (conf.once_max_sample_keynum + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int *d_prefix_sum = reinterpret_cast(d_prefix_sum_ptr->ptr()); - uint64_t *d_sample_keys = - reinterpret_cast(d_sample_keys_ptr->ptr()); + cudaMemsetAsync( + d_prefix_sum, 0, (conf.once_max_sample_keynum + 1) * sizeof(int), stream); int *d_sampleidx2row = - reinterpret_cast(d_sampleidx2rows[cur_sampleidx2row]->ptr()); - int *d_tmp_sampleidx2row = - reinterpret_cast(d_sampleidx2rows[1 - cur_sampleidx2row]->ptr()); + reinterpret_cast((*d_sampleidx2rows)[*cur_sampleidx2row]->ptr()); + int *d_tmp_sampleidx2row = reinterpret_cast( + (*d_sampleidx2rows)[1 - *cur_sampleidx2row]->ptr()); CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, @@ -1318,18 +1392,14 @@ void FillOneStep( conf.walk_degree, conf.walk_len, d_actual_sample_size, - d_neighbors, - d_sample_keys); + d_neighbors); } else { GraphFillSampleKeysKernel<<>>( - d_neighbors, - d_sample_keys, d_prefix_sum, d_sampleidx2row, d_tmp_sampleidx2row, d_actual_sample_size, - cur_degree, len); GraphDoWalkKernel<<>>( @@ -1346,21 +1416,19 @@ void FillOneStep( edge_dst_id); } if (conf.debug_mode) { - size_t once_max_sample_keynum = - conf.walk_degree * conf.once_sample_startid_len; int *h_prefix_sum = new int[len + 1]; int *h_actual_size = new int[len]; - int *h_offset2idx = new int[once_max_sample_keynum]; + int *h_offset2idx = new int[conf.once_max_sample_keynum]; cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, - once_max_sample_keynum * sizeof(int), + conf.once_max_sample_keynum * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(h_prefix_sum, d_prefix_sum, (len + 1) * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < once_max_sample_keynum; xx++) { + for (int xx = 0; xx < conf.once_max_sample_keynum; xx++) { VLOG(2) << "h_offset2idx[" << xx << "]: " << 
h_offset2idx[xx]; } for (int xx = 0; xx < len + 1; xx++) { @@ -1371,10 +1439,10 @@ void FillOneStep( delete[] h_offset2idx; } cudaStreamSynchronize(stream); - cur_sampleidx2row = 1 - cur_sampleidx2row; + *cur_sampleidx2row = 1 - *cur_sampleidx2row; } -int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { +int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num, int tensor_pair_idx) { platform::CUDADeviceGuard guard(conf_.gpuid); auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); std::shared_ptr d_feature_list; @@ -1382,16 +1450,14 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { size_t temp_bytes = (key_num + 1) * sizeof(uint32_t); if (d_feature_size_list_buf_ == NULL || - d_feature_size_list_buf_->size() < temp_bytes) { - d_feature_size_list_buf_ = - memory::AllocShared(this->place_, temp_bytes); + d_feature_size_list_buf_->size() < temp_bytes) { + d_feature_size_list_buf_ = memory::AllocShared(this->place_, temp_bytes); } if (d_feature_size_prefixsum_buf_ == NULL || - d_feature_size_prefixsum_buf_->size() < temp_bytes) { - d_feature_size_prefixsum_buf_ = - memory::AllocShared(this->place_, temp_bytes); + d_feature_size_prefixsum_buf_->size() < temp_bytes) { + d_feature_size_prefixsum_buf_ = + memory::AllocShared(this->place_, temp_bytes); } - int fea_num = gpu_graph_ptr->get_feature_info_of_nodes(conf_.gpuid, d_walk, @@ -1400,29 +1466,42 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { d_feature_size_prefixsum_buf_, d_feature_list, d_slot_list); - int64_t *slot_tensor_ptr_[conf_.slot_num]; - int64_t *slot_lod_tensor_ptr_[conf_.slot_num]; + // num of slot feature + int slot_num = conf_.slot_num - float_slot_num_; + int64_t *slot_tensor_ptr_[slot_num]; + int64_t *slot_lod_tensor_ptr_[slot_num]; + + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + ++feed_vec_idx; + if (conf_.enable_pair_label) { + ++feed_vec_idx; + } + if (fea_num == 0) { int64_t default_lod = 1; + int ii = 0; for (int i = 0; i < conf_.slot_num; ++i) { - slot_lod_tensor_ptr_[i] = - feed_vec_[id_offset_of_feed_vec_ + 2 * i + 1]->mutable_data( - {(long)key_num + 1}, this->place_); // NOLINT - slot_tensor_ptr_[i] = - feed_vec_[id_offset_of_feed_vec_ + 2 * i]->mutable_data( - {1, 1}, this->place_); - CUDA_CHECK(cudaMemsetAsync( - slot_tensor_ptr_[i], 0, sizeof(int64_t), train_stream_)); - CUDA_CHECK(cudaMemsetAsync(slot_lod_tensor_ptr_[i], - 0, - sizeof(int64_t) * key_num, - train_stream_)); - CUDA_CHECK(cudaMemcpyAsync( - reinterpret_cast(slot_lod_tensor_ptr_[i] + key_num), + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + slot_lod_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i + 1]->mutable_data( + {(long)key_num + 1}, this->place_); // NOLINT + slot_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i]->mutable_data( + {1, 1}, this->place_); + CUDA_CHECK(cudaMemsetAsync( + slot_tensor_ptr_[ii], 0, sizeof(int64_t), train_stream_)); + CUDA_CHECK(cudaMemsetAsync(slot_lod_tensor_ptr_[ii], + 0, + sizeof(int64_t) * key_num, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync( + reinterpret_cast(slot_lod_tensor_ptr_[ii] + key_num), &default_lod, sizeof(int64_t), cudaMemcpyHostToDevice, train_stream_)); + ii++; + } } CUDA_CHECK(cudaStreamSynchronize(train_stream_)); return 0; @@ -1432,9 +1511,9 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { reinterpret_cast(d_feature_list->ptr()); uint8_t *d_slot_list_ptr = reinterpret_cast(d_slot_list->ptr()); uint32_t 
*d_feature_size_list_ptr = - reinterpret_cast(d_feature_size_list_buf_->ptr()); + reinterpret_cast(d_feature_size_list_buf_->ptr()); uint32_t *d_feature_size_prefixsum_ptr = - reinterpret_cast(d_feature_size_prefixsum_buf_->ptr()); + reinterpret_cast(d_feature_size_prefixsum_buf_->ptr()); VLOG(2) << "end trans feature list and slot list"; CUDA_CHECK(cudaStreamSynchronize(train_stream_)); @@ -1455,7 +1534,6 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { dim3 grid((key_num - 1) / 256 + 1); dim3 block(1, 256); - get_each_ins_info<<>>( d_slot_list_ptr, d_feature_size_list_ptr, @@ -1463,100 +1541,125 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { d_each_ins_slot_num_ptr, d_each_ins_slot_num_inner_prefix_ptr, key_num, - conf_.slot_num); + slot_num); - std::vector> ins_slot_num(conf_.slot_num, + std::vector> ins_slot_num(slot_num, nullptr); - std::vector ins_slot_num_vecotr(conf_.slot_num, NULL); + std::vector ins_slot_num_vecotr(slot_num, NULL); std::shared_ptr d_ins_slot_num_vector = - memory::AllocShared(place_, (conf_.slot_num) * sizeof(uint64_t *)); + memory::AllocShared(place_, (slot_num) * sizeof(uint64_t *)); uint64_t **d_ins_slot_num_vector_ptr = reinterpret_cast(d_ins_slot_num_vector->ptr()); + + int ii = 0; for (int i = 0; i < conf_.slot_num; i++) { - ins_slot_num[i] = memory::AllocShared(place_, key_num * sizeof(uint64_t)); - ins_slot_num_vecotr[i] = - reinterpret_cast(ins_slot_num[i]->ptr()); - } - CUDA_CHECK( - cudaMemcpyAsync(reinterpret_cast(d_ins_slot_num_vector_ptr), - ins_slot_num_vecotr.data(), - sizeof(uint64_t *) * conf_.slot_num, - cudaMemcpyHostToDevice, - train_stream_)); - fill_slot_num<<>>(d_each_ins_slot_num_ptr, - d_ins_slot_num_vector_ptr, - key_num, - conf_.slot_num); - CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + ins_slot_num[ii] = memory::AllocShared(place_, key_num * sizeof(uint64_t)); + ins_slot_num_vecotr[ii] = + reinterpret_cast(ins_slot_num[ii]->ptr()); + ii++; + } + } + if (slot_num > 0) { + CUDA_CHECK( + cudaMemcpyAsync(reinterpret_cast(d_ins_slot_num_vector_ptr), + ins_slot_num_vecotr.data(), + sizeof(uint64_t *) * slot_num, + cudaMemcpyHostToDevice, + train_stream_)); + fill_slot_num<<>>(d_each_ins_slot_num_ptr, + d_ins_slot_num_vector_ptr, + key_num, + slot_num); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); - for (int i = 0; i < conf_.slot_num; ++i) { - slot_lod_tensor_ptr_[i] = - feed_vec_[id_offset_of_feed_vec_ + 2 * i + 1]->mutable_data( + ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + slot_lod_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i + 1]->mutable_data( {(long)key_num + 1}, this->place_); // NOLINT - } - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, - temp_storage_bytes, - ins_slot_num_vecotr[0], - slot_lod_tensor_ptr_[0] + 1, - key_num, - train_stream_)); - CUDA_CHECK(cudaStreamSynchronize(train_stream_)); - auto d_temp_storage = memory::Alloc( - this->place_, - temp_storage_bytes, - phi::Stream(reinterpret_cast(train_stream_))); - std::vector each_slot_fea_num(conf_.slot_num, 0); - for (int i = 0; i < conf_.slot_num; ++i) { - CUDA_CHECK(cudaMemsetAsync( - slot_lod_tensor_ptr_[i], 0, sizeof(uint64_t), train_stream_)); - CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + ii++; + } + } + + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, - 
ins_slot_num_vecotr[i], - slot_lod_tensor_ptr_[i] + 1, + ins_slot_num_vecotr[0], + slot_lod_tensor_ptr_[0] + 1, key_num, train_stream_)); - CUDA_CHECK(cudaMemcpyAsync(&each_slot_fea_num[i], - slot_lod_tensor_ptr_[i] + key_num, - sizeof(uint64_t), - cudaMemcpyDeviceToHost, - train_stream_)); - } - CUDA_CHECK(cudaStreamSynchronize(train_stream_)); - for (int i = 0; i < conf_.slot_num; ++i) { - slot_tensor_ptr_[i] = - feed_vec_[id_offset_of_feed_vec_ + 2 * i]->mutable_data( - {each_slot_fea_num[i], 1}, this->place_); - } - int64_t default_lod = 1; - for (int i = 0; i < conf_.slot_num; ++i) { - fill_slot_tensor<<>>( - d_feature_list_ptr, - d_feature_size_prefixsum_ptr, - d_each_ins_slot_num_inner_prefix_ptr, - ins_slot_num_vecotr[i], - slot_lod_tensor_ptr_[i], - slot_tensor_ptr_[i], - i, - conf_.slot_num, - key_num); - // trick for empty tensor - if (each_slot_fea_num[i] == 0) { - slot_tensor_ptr_[i] = - feed_vec_[id_offset_of_feed_vec_ + 2 * i]->mutable_data( - {1, 1}, this->place_); - CUDA_CHECK(cudaMemsetAsync( - slot_tensor_ptr_[i], 0, sizeof(uint64_t), train_stream_)); - CUDA_CHECK(cudaMemcpyAsync( - reinterpret_cast(slot_lod_tensor_ptr_[i] + key_num), - &default_lod, - sizeof(int64_t), - cudaMemcpyHostToDevice, - train_stream_)); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + + + auto d_temp_storage = memory::Alloc( + this->place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(train_stream_))); + + ii = 0; + std::vector each_slot_fea_num(slot_num, 0); + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + CUDA_CHECK(cudaMemsetAsync( + slot_lod_tensor_ptr_[ii], 0, sizeof(uint64_t), train_stream_)); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + ins_slot_num_vecotr[ii], + slot_lod_tensor_ptr_[ii] + 1, + key_num, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync(&each_slot_fea_num[ii], + slot_lod_tensor_ptr_[ii] + key_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + train_stream_)); + ii++; + } + } + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + slot_tensor_ptr_[ii] = feed_vec_[feed_vec_idx + 2 * i]->mutable_data( + {each_slot_fea_num[ii], 1}, this->place_); + ii++; + } + } + ii = 0; + int64_t default_lod = 1; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'u') { + fill_slot_tensor<<>>( + d_feature_list_ptr, + d_feature_size_prefixsum_ptr, + d_each_ins_slot_num_inner_prefix_ptr, + ins_slot_num_vecotr[ii], + slot_lod_tensor_ptr_[ii], + slot_tensor_ptr_[ii], + ii, + slot_num, + key_num); + + // trick for empty tensor + if (each_slot_fea_num[ii] == 0) { + slot_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i]->mutable_data({1, 1}, this->place_); + CUDA_CHECK(cudaMemsetAsync( + slot_tensor_ptr_[ii], 0, sizeof(uint64_t), train_stream_)); + CUDA_CHECK(cudaMemcpyAsync( + reinterpret_cast(slot_lod_tensor_ptr_[ii] + key_num), + &default_lod, + sizeof(int64_t), + cudaMemcpyHostToDevice, + train_stream_)); + } + ii++; + } } + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); } - CUDA_CHECK(cudaStreamSynchronize(train_stream_)); - if (conf_.debug_mode) { std::vector h_feature_size_list(key_num, 0); std::vector h_feature_size_list_prefixsum(key_num, 0); @@ -1608,7 +1711,8 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { VLOG(0) << "all fea_num is " << fea_num << " calc fea_num is " << 
h_feature_size_list[key_num - 1] + h_feature_size_list_prefixsum[key_num - 1]; - for (int i = 0; i < conf_.slot_num; ++i) { + + for (int i = 0; i < slot_num; ++i) { std::vector h_slot_lod_tensor(key_num + 1, 0); CUDA_CHECK( cudaMemcpyAsync(reinterpret_cast(h_slot_lod_tensor.data()), @@ -1644,33 +1748,373 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) { return 0; } -uint64_t CopyUniqueNodes( - HashTable *table, - uint64_t copy_unique_len, - const paddle::platform::Place &place, - const std::shared_ptr &d_uniq_node_num_ptr, - std::vector &host_vec, // output - cudaStream_t stream); +int GraphDataGenerator::FillFloatFeature(uint64_t *d_walk, size_t key_num, int tensor_pair_idx) { + platform::CUDADeviceGuard guard(conf_.gpuid); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + std::shared_ptr d_feature_list; + std::shared_ptr d_slot_list; -// 对于deepwalk模式,尝试插入table,0表示插入成功,1表示插入失败; -// 对于sage模式,尝试插入table,table数量不够则清空table重新插入,返回值无影响。 -int InsertTable(const uint64_t *d_keys, // Input - uint64_t len, // Input - std::shared_ptr &d_uniq_node_num, - const GraphDataGeneratorConfig &conf, - uint64_t ©_unique_len, - const paddle::platform::Place &place, - HashTable *table, - std::vector &host_vec, // Output - cudaStream_t stream) { - // Used under NOT WHOLE_HBM. - uint64_t h_uniq_node_num = 0; - uint64_t *d_uniq_node_num_ptr = - reinterpret_cast(d_uniq_node_num->ptr()); - cudaMemcpyAsync(&h_uniq_node_num, - d_uniq_node_num_ptr, - sizeof(uint64_t), - cudaMemcpyDeviceToHost, + if (conf_.sage_mode) { + size_t temp_storage_bytes = (key_num + 1) * sizeof(uint32_t); + if (d_feature_size_list_buf_ == NULL || + d_feature_size_list_buf_->size() < temp_storage_bytes) { + d_feature_size_list_buf_ = + memory::AllocShared(this->place_, temp_storage_bytes); + } + if (d_feature_size_prefixsum_buf_ == NULL || + d_feature_size_prefixsum_buf_->size() < temp_storage_bytes) { + d_feature_size_prefixsum_buf_ = + memory::AllocShared(this->place_, temp_storage_bytes); + } + } + + uint32_t *d_feature_size_list_ptr = + reinterpret_cast(d_feature_size_list_buf_->ptr()); + uint32_t *d_feature_size_prefixsum_ptr = + reinterpret_cast(d_feature_size_prefixsum_buf_->ptr()); + + int fea_num = + gpu_graph_ptr->get_float_feature_info_of_nodes(conf_.gpuid, + d_walk, + key_num, + d_feature_size_list_ptr, + d_feature_size_prefixsum_ptr, + d_feature_list, + d_slot_list); + + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + ++feed_vec_idx; + if (conf_.enable_pair_label) { + ++feed_vec_idx; + } + + float *slot_tensor_ptr_[float_slot_num_]; + int64_t *slot_lod_tensor_ptr_[float_slot_num_]; + if (fea_num == 0) { + int64_t default_lod = 1; + int ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + slot_lod_tensor_ptr_[ii] = feed_vec_[feed_vec_idx + 2 * i + 1]->mutable_data( + {(long)key_num + 1}, this->place_); // NOLINT + slot_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i]->mutable_data({1, 1}, this->place_); + CUDA_CHECK(cudaMemsetAsync( + slot_tensor_ptr_[ii], 0, sizeof(float), train_stream_)); + CUDA_CHECK(cudaMemsetAsync(slot_lod_tensor_ptr_[ii], + 0, + sizeof(int64_t) * key_num, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync( + reinterpret_cast(slot_lod_tensor_ptr_[ii] + key_num), + &default_lod, + sizeof(int64_t), + cudaMemcpyHostToDevice, + train_stream_)); + ii++; + } + } + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + return 0; + } + + float *d_feature_list_ptr = + 
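// When a batch has no float features at all (the fea_num == 0 branch above), the code
// still hands every float slot a well-formed pair of tensors: a {1, 1} value tensor
// memset to zero and a LoD whose first key_num entries are 0 and whose last entry is a
// default of 1. A plain host sketch of that fix-up, with std::vector standing in for
// the feed tensors:
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t key_num = 3;                  // instances in the batch
  std::vector<int64_t> lod(key_num + 1, 0);  // no instance has any float feature
  std::vector<float> values;                 // would otherwise be a zero-sized tensor

  if (lod[key_num] == 0) {
    values.assign(1, 0.0f);  // placeholder {1, 1} tensor, zero-filled
    lod[key_num] = 1;        // default_lod = 1 keeps the LoD non-degenerate
  }
  printf("values.size() = %zu, lod = {%lld, %lld, %lld, %lld}\n",
         values.size(), (long long)lod[0], (long long)lod[1],
         (long long)lod[2], (long long)lod[3]);
  return 0;
}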
reinterpret_cast(d_feature_list->ptr()); + uint8_t *d_slot_list_ptr = reinterpret_cast(d_slot_list->ptr()); + + std::shared_ptr d_each_ins_slot_num_inner_prefix = + memory::AllocShared(place_, (float_slot_num_ * key_num) * sizeof(uint32_t)); + std::shared_ptr d_each_ins_slot_num = + memory::AllocShared(place_, (float_slot_num_ * key_num) * sizeof(uint32_t)); + uint32_t *d_each_ins_slot_num_ptr = + reinterpret_cast(d_each_ins_slot_num->ptr()); + uint32_t *d_each_ins_slot_num_inner_prefix_ptr = + reinterpret_cast(d_each_ins_slot_num_inner_prefix->ptr()); + CUDA_CHECK(cudaMemsetAsync(d_each_ins_slot_num_ptr, + 0, + float_slot_num_ * key_num * sizeof(uint32_t), + train_stream_)); + + dim3 grid((key_num - 1) / 256 + 1); + dim3 block(1, 256); + get_each_ins_float_info<<>>( + d_slot_list_ptr, + d_feature_size_list_ptr, + d_feature_size_prefixsum_ptr, + d_each_ins_slot_num_ptr, + d_each_ins_slot_num_inner_prefix_ptr, + key_num, + float_slot_num_); + + std::vector> ins_slot_num(float_slot_num_, + nullptr); + std::vector ins_slot_num_vecotr(float_slot_num_, NULL); + std::shared_ptr d_ins_slot_num_vector = + memory::AllocShared(place_, (float_slot_num_) * sizeof(uint64_t *)); + uint64_t **d_ins_slot_num_vector_ptr = + reinterpret_cast(d_ins_slot_num_vector->ptr()); + + int ii = 0; + for (int i = 0; i < conf_.slot_num; i++) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + ins_slot_num[ii] = memory::AllocShared(place_, key_num * sizeof(uint64_t)); + ins_slot_num_vecotr[ii] = + reinterpret_cast(ins_slot_num[ii]->ptr()); + ii++; + } + } + + if (float_slot_num_ > 0) { + CUDA_CHECK( + cudaMemcpyAsync(reinterpret_cast(d_ins_slot_num_vector_ptr), + ins_slot_num_vecotr.data(), + sizeof(uint64_t *) * float_slot_num_, + cudaMemcpyHostToDevice, + train_stream_)); + + fill_slot_num<<>>( + d_each_ins_slot_num_ptr, d_ins_slot_num_vector_ptr, key_num, float_slot_num_); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + + ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + slot_lod_tensor_ptr_[ii] = feed_vec_[feed_vec_idx + 2 * i + 1]->mutable_data( + {(long)key_num + 1}, this->place_); // NOLINT + ii++; + } + } + size_t temp_storage_bytes = 0; + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, + temp_storage_bytes, + ins_slot_num_vecotr[0], + slot_lod_tensor_ptr_[0] + 1, + key_num, + train_stream_)); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + auto d_temp_storage = memory::Alloc( + this->place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(train_stream_))); + std::vector each_slot_fea_num(float_slot_num_, 0); + + ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + CUDA_CHECK(cudaMemsetAsync( + slot_lod_tensor_ptr_[ii], 0, sizeof(uint64_t), train_stream_)); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + ins_slot_num_vecotr[ii], + slot_lod_tensor_ptr_[ii] + 1, + key_num, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync(&each_slot_fea_num[ii], + slot_lod_tensor_ptr_[ii] + key_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + train_stream_)); + ii++; + } + } + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + ii = 0; + for (int i = 0; i < conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + slot_tensor_ptr_[ii] = feed_vec_[feed_vec_idx + 2 * i]->mutable_data( + {each_slot_fea_num[ii], 1}, this->place_); + ii++; + } + } + ii = 0; + int64_t default_lod = 1; + for (int i = 0; i < 
conf_.slot_num; ++i) { + if ((*feed_info_)[feed_vec_idx + 2 * i].type[0] == 'f') { + fill_float_tensor<<>>( + d_feature_list_ptr, + d_feature_size_prefixsum_ptr, + d_each_ins_slot_num_inner_prefix_ptr, + ins_slot_num_vecotr[ii], + slot_lod_tensor_ptr_[ii], + slot_tensor_ptr_[ii], + ii, + float_slot_num_, + key_num); + // trick for empty tensor + if (each_slot_fea_num[ii] == 0) { + slot_tensor_ptr_[ii] = + feed_vec_[feed_vec_idx + 2 * i]->mutable_data({1, 1}, this->place_); + CUDA_CHECK(cudaMemsetAsync( + slot_tensor_ptr_[ii], 0, sizeof(float), train_stream_)); + CUDA_CHECK(cudaMemcpyAsync( + reinterpret_cast(slot_lod_tensor_ptr_[ii] + key_num), + &default_lod, + sizeof(int64_t), + cudaMemcpyHostToDevice, + train_stream_)); + } + ii++; + } + } + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + } + if (conf_.debug_mode) { + std::vector h_feature_size_list(key_num, 0); + std::vector h_feature_size_list_prefixsum(key_num, 0); + std::vector node_list(key_num, 0); + std::vector h_feature_list(fea_num, 0); + std::vector h_slot_list(fea_num, 0); + + CUDA_CHECK( + cudaMemcpyAsync(reinterpret_cast(h_feature_size_list.data()), + d_feature_size_list_ptr, + sizeof(uint32_t) * key_num, + cudaMemcpyDeviceToHost, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync( + reinterpret_cast(h_feature_size_list_prefixsum.data()), + d_feature_size_prefixsum_ptr, + sizeof(uint32_t) * key_num, + cudaMemcpyDeviceToHost, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast(node_list.data()), + d_walk, + sizeof(uint64_t) * key_num, + cudaMemcpyDeviceToHost, + train_stream_)); + + CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast(h_feature_list.data()), + d_feature_list_ptr, + sizeof(float) * fea_num, + cudaMemcpyDeviceToHost, + train_stream_)); + CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast(h_slot_list.data()), + d_slot_list_ptr, + sizeof(uint8_t) * fea_num, + cudaMemcpyDeviceToHost, + train_stream_)); + + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + for (size_t i = 0; i < key_num; i++) { + std::stringstream ss; + ss << "node_id: " << node_list[i] + << " fea_num: " << h_feature_size_list[i] << " offset " + << h_feature_size_list_prefixsum[i] << " slot: "; + for (uint32_t j = 0; j < h_feature_size_list[i]; j++) { + ss << int(h_slot_list[h_feature_size_list_prefixsum[i] + j]) << " : " + << h_feature_list[h_feature_size_list_prefixsum[i] + j] << " "; + } + VLOG(0) << ss.str(); + } + VLOG(0) << "all float fea_num is " << fea_num << " calc float fea_num is " + << h_feature_size_list[key_num - 1] + + h_feature_size_list_prefixsum[key_num - 1]; + for (int i = 0; i < float_slot_num_; ++i) { + std::vector h_slot_lod_tensor(key_num + 1, 0); + CUDA_CHECK( + cudaMemcpyAsync(reinterpret_cast(h_slot_lod_tensor.data()), + slot_lod_tensor_ptr_[i], + sizeof(int64_t) * (key_num + 1), + cudaMemcpyDeviceToHost, + train_stream_)); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + std::stringstream ss_lod; + std::stringstream ss_tensor; + ss_lod << " slot " << i << " lod is ["; + for (size_t j = 0; j < key_num + 1; j++) { + ss_lod << h_slot_lod_tensor[j] << ","; + } + ss_lod << "]"; + std::vector h_slot_tensor(h_slot_lod_tensor[key_num], 0); + CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast(h_slot_tensor.data()), + slot_tensor_ptr_[i], + sizeof(float) * h_slot_lod_tensor[key_num], + cudaMemcpyDeviceToHost, + train_stream_)); + CUDA_CHECK(cudaStreamSynchronize(train_stream_)); + + ss_tensor << " tensor is [ "; + for (size_t j = 0; j < h_slot_lod_tensor[key_num]; j++) { + ss_tensor << h_slot_tensor[j] << ","; + } + ss_tensor << 
"]"; + VLOG(0) << ss_lod.str() << " " << ss_tensor.str(); + } + } + return 0; +} + +uint64_t CopyUniqueNodes( + HashTable *table, + uint64_t copy_unique_len, + const paddle::platform::Place &place, + const std::shared_ptr &d_uniq_node_num_ptr, + std::vector *host_vec_ptr, // output + cudaStream_t stream) { + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num = + reinterpret_cast(d_uniq_node_num_ptr->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + auto d_uniq_node = memory::AllocShared( + place, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); + uint64_t *d_uniq_node_ptr = + reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), stream); + // uint64_t unused_key = std::numeric_limits::max(); + table->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, stream); + + cudaStreamSynchronize(stream); + + host_vec_ptr->resize(h_uniq_node_num + copy_unique_len); + cudaMemcpyAsync(host_vec_ptr->data() + copy_unique_len, + d_uniq_node_ptr, + sizeof(uint64_t) * h_uniq_node_num, + cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + return h_uniq_node_num; + } + return 0; +} + +// 对于deepwalk模式,尝试插入table,0表示插入成功,1表示插入失败; +// 对于sage模式,尝试插入table,table数量不够则清空table重新插入,返回值无影响。 +int InsertTable(const uint64_t *d_keys, // Input + uint64_t len, // Input + std::shared_ptr *d_uniq_node_num, + const GraphDataGeneratorConfig &conf, + uint64_t *copy_unique_len_ptr, + const paddle::platform::Place &place, + HashTable *table, + std::vector *host_vec_ptr, // Output + cudaStream_t stream) { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return 0; + } + // Used under NOT WHOLE_HBM. + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast((*d_uniq_node_num)->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); @@ -1682,9 +2126,13 @@ int InsertTable(const uint64_t *d_keys, // Input return 1; } else { // Copy unique nodes first. - uint64_t copy_len = CopyUniqueNodes( - table, copy_unique_len, place, d_uniq_node_num, host_vec, stream); - copy_unique_len += copy_len; + uint64_t copy_len = CopyUniqueNodes(table, + *copy_unique_len_ptr, + place, + *d_uniq_node_num, + host_vec_ptr, + stream); + *copy_unique_len_ptr += copy_len; table->clear(stream); cudaMemsetAsync(d_uniq_node_num_ptr, 0, sizeof(uint64_t), stream); } @@ -1692,9 +2140,13 @@ int InsertTable(const uint64_t *d_keys, // Input } else { // used only for sage_mode. 
if (h_uniq_node_num + len >= conf.infer_table_cap) { - uint64_t copy_len = CopyUniqueNodes( - table, copy_unique_len, place, d_uniq_node_num, host_vec, stream); - copy_unique_len += copy_len; + uint64_t copy_len = CopyUniqueNodes(table, + *copy_unique_len_ptr, + place, + *d_uniq_node_num, + host_vec_ptr, + stream); + *copy_unique_len_ptr += copy_len; table->clear(stream); cudaMemsetAsync(d_uniq_node_num_ptr, 0, sizeof(uint64_t), stream); } @@ -1705,76 +2157,80 @@ int InsertTable(const uint64_t *d_keys, // Input return 0; } -std::vector> -GraphDataGenerator::SampleNeighbors(int64_t *uniq_nodes, - int len, - int sample_size, - std::vector &edges_split_num, - int64_t *neighbor_len) { +std::vector> SampleNeighbors( + int64_t *uniq_nodes, + int len, + int sample_size, + const GraphDataGeneratorConfig &conf, + std::vector *edges_split_num_ptr, + int64_t *neighbor_len, + std::vector> *edge_type_graph_ptr, + const paddle::platform::Place &place, + cudaStream_t stream) { auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); auto sample_res = gpu_graph_ptr->graph_neighbor_sample_sage( - conf_.gpuid, - edge_to_id_len_, + conf.gpuid, + conf.edge_to_id_len, reinterpret_cast(uniq_nodes), sample_size, len, - edge_type_graph_, - conf_.weighted_sample, - conf_.return_weight); + *edge_type_graph_ptr, + conf.weighted_sample, + conf.return_weight); int *all_sample_count_ptr = reinterpret_cast(sample_res.actual_sample_size_mem->ptr()); - auto cumsum_actual_sample_size = memory::Alloc( - place_, - (len * edge_to_id_len_ + 1) * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); + auto cumsum_actual_sample_size = + memory::Alloc(place, + (len * conf.edge_to_id_len + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int *cumsum_actual_sample_size_ptr = reinterpret_cast(cumsum_actual_sample_size->ptr()); cudaMemsetAsync(cumsum_actual_sample_size_ptr, 0, - (len * edge_to_id_len_ + 1) * sizeof(int), - sample_stream_); + (len * conf.edge_to_id_len + 1) * sizeof(int), + stream); size_t temp_storage_bytes = 0; CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, all_sample_count_ptr, cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - sample_stream_)); - auto d_temp_storage = memory::Alloc( - place_, - temp_storage_bytes, - phi::Stream(reinterpret_cast(sample_stream_))); + len * conf.edge_to_id_len, + stream)); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, all_sample_count_ptr, cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - sample_stream_)); - cudaStreamSynchronize(sample_stream_); + len * conf.edge_to_id_len, + stream)); + cudaStreamSynchronize(stream); - edges_split_num.resize(edge_to_id_len_); - for (int i = 0; i < edge_to_id_len_; i++) { - cudaMemcpyAsync(edges_split_num.data() + i, + edges_split_num_ptr->resize(conf.edge_to_id_len); + for (int i = 0; i < conf.edge_to_id_len; i++) { + cudaMemcpyAsync(edges_split_num_ptr->data() + i, cumsum_actual_sample_size_ptr + (i + 1) * len, sizeof(int), cudaMemcpyDeviceToHost, - sample_stream_); + stream); } - CUDA_CHECK(cudaStreamSynchronize(sample_stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream)); - int all_sample_size = edges_split_num[edge_to_id_len_ - 1]; - auto final_sample_val = memory::AllocShared( - place_, - all_sample_size * sizeof(int64_t), - phi::Stream(reinterpret_cast(sample_stream_))); - auto final_sample_val_dst = memory::AllocShared( - place_, - 
all_sample_size * sizeof(int64_t), - phi::Stream(reinterpret_cast(sample_stream_))); + int all_sample_size = (*edges_split_num_ptr)[conf.edge_to_id_len - 1]; + auto final_sample_val = + memory::AllocShared(place, + all_sample_size * sizeof(int64_t), + phi::Stream(reinterpret_cast(stream))); + auto final_sample_val_dst = + memory::AllocShared(place, + all_sample_size * sizeof(int64_t), + phi::Stream(reinterpret_cast(stream))); int64_t *final_sample_val_ptr = reinterpret_cast(final_sample_val->ptr()); int64_t *final_sample_val_dst_ptr = @@ -1784,37 +2240,37 @@ GraphDataGenerator::SampleNeighbors(int64_t *uniq_nodes, std::shared_ptr final_sample_weight; float *final_sample_weight_ptr = nullptr, *all_sample_weight_ptr = nullptr; - if (conf_.return_weight) { + if (conf.return_weight) { final_sample_weight = memory::AllocShared( - place_, + place, all_sample_size * sizeof(float), - phi::Stream(reinterpret_cast(sample_stream_))); + phi::Stream(reinterpret_cast(stream))); final_sample_weight_ptr = reinterpret_cast(final_sample_weight->ptr()); all_sample_weight_ptr = reinterpret_cast(sample_res.weight_mem->ptr()); } - FillActualNeighbors<<>>(all_sample_val_ptr, - final_sample_val_ptr, - final_sample_val_dst_ptr, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr, - all_sample_weight_ptr, - final_sample_weight_ptr, - sample_size, - len * edge_to_id_len_, - len, - conf_.return_weight); + stream>>>(all_sample_val_ptr, + final_sample_val_ptr, + final_sample_val_dst_ptr, + all_sample_count_ptr, + cumsum_actual_sample_size_ptr, + all_sample_weight_ptr, + final_sample_weight_ptr, + sample_size, + len * conf.edge_to_id_len, + len, + conf.return_weight); *neighbor_len = all_sample_size; - cudaStreamSynchronize(sample_stream_); + cudaStreamSynchronize(stream); std::vector> sample_results; sample_results.emplace_back(final_sample_val); sample_results.emplace_back(final_sample_val_dst); - if (conf_.return_weight) { + if (conf.return_weight) { sample_results.emplace_back(final_sample_weight); } return sample_results; @@ -1969,29 +2425,37 @@ std::shared_ptr GetReindexResult( return final_nodes; } -std::shared_ptr GraphDataGenerator::GenerateSampleGraph( - uint64_t *node_ids, - int len, +std::shared_ptr GenerateSampleGraph( + uint64_t *node_ids, // input + int len, // input int *final_len, - std::shared_ptr &inverse) { - VLOG(2) << conf_.gpuid << " Get Unique Nodes"; + const GraphDataGeneratorConfig &conf, + std::vector> *inverse_vec_ptr, + std::vector>> + *graph_edges_vec_ptr, // output + std::vector>> + *edges_split_num_vec_ptr, // output + std::vector> *edge_type_graph_ptr, + const paddle::platform::Place &place, + cudaStream_t stream) { + VLOG(2) << conf.gpuid << " Get Unique Nodes"; - auto uniq_nodes = memory::Alloc( - place_, - len * sizeof(uint64_t), - phi::Stream(reinterpret_cast(sample_stream_))); - int *inverse_ptr = reinterpret_cast(inverse->ptr()); + auto inverse = memory::AllocShared(place, len * sizeof(uint32_t), + phi::Stream(reinterpret_cast(stream))); + uint32_t *inverse_ptr = reinterpret_cast(inverse->ptr()); + auto uniq_nodes = memory::Alloc(place, len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); int64_t *uniq_nodes_data = reinterpret_cast(uniq_nodes->ptr()); int uniq_len = dedup_keys_and_fillidx(node_ids, len, reinterpret_cast(uniq_nodes_data), - reinterpret_cast(inverse_ptr), - place_, - sample_stream_); - int len_samples = samples_.size(); + inverse_ptr, + place, + stream); + int len_samples = conf.samples.size(); - VLOG(2) << conf_.gpuid << " Sample Neighbors and 
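// graph_neighbor_sample_sage returns a padded buffer with sample_size slots per
// (node, edge type) plus the actual count of each group; the inclusive scan above
// turns those counts into write offsets, and FillActualNeighbors compacts the padded
// buffer into a dense edge list. A host sketch of that compaction (values are
// illustrative, -1 marks unused padding):
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int sample_size = 3;                // max neighbors kept per key
  std::vector<int64_t> padded = {7, 9, -1,  // key 0: 2 real samples
                                 4, -1, -1, // key 1: 1 real sample
                                 5, 6, 8};  // key 2: 3 real samples
  std::vector<int> counts = {2, 1, 3};      // actual sample size per key

  std::vector<int> offsets(counts.size() + 1, 0);  // cumsum_actual_sample_size
  for (size_t i = 0; i < counts.size(); ++i) offsets[i + 1] = offsets[i] + counts[i];

  std::vector<int64_t> dense(offsets.back());      // final_sample_val
  for (size_t i = 0; i < counts.size(); ++i)
    for (int j = 0; j < counts[i]; ++j)
      dense[offsets[i] + j] = padded[i * sample_size + j];

  for (auto v : dense) printf("%lld ", (long long)v);  // prints: 7 9 4 5 6 8
  printf("\n");
  return 0;
}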
Reindex"; + VLOG(2) << conf.gpuid << " Sample Neighbors and Reindex"; std::vector edges_split_num; std::vector> final_nodes_vec; std::vector> graph_edges; @@ -2005,12 +2469,16 @@ std::shared_ptr GraphDataGenerator::GenerateSampleGraph( if (i == 0) { auto sample_results = SampleNeighbors(uniq_nodes_data, uniq_len, - samples_[i], - edges_split_num, - &neighbors_len); + conf.samples[i], + conf, + &edges_split_num, + &neighbors_len, + edge_type_graph_ptr, + place, + stream); neighbors = sample_results[0]; reindex_dst = sample_results[1]; - if (conf_.return_weight) { + if (conf.return_weight) { weights = sample_results[2]; } edges_split_num.push_back(uniq_len); @@ -2019,12 +2487,16 @@ std::shared_ptr GraphDataGenerator::GenerateSampleGraph( reinterpret_cast(final_nodes_vec[i - 1]->ptr()); auto sample_results = SampleNeighbors(final_nodes_data, final_nodes_len_vec[i - 1], - samples_[i], - edges_split_num, - &neighbors_len); + conf.samples[i], + conf, + &edges_split_num, + &neighbors_len, + edge_type_graph_ptr, + place, + stream); neighbors = sample_results[0]; reindex_dst = sample_results[1]; - if (conf_.return_weight) { + if (conf.return_weight) { weights = sample_results[2]; } edges_split_num.push_back(final_nodes_len_vec[i - 1]); @@ -2036,11 +2508,11 @@ std::shared_ptr GraphDataGenerator::GenerateSampleGraph( auto tmp_final_nodes = GetReindexResult(reindex_src_data, uniq_nodes_data, &final_nodes_len, - conf_.reindex_table_size, + conf.reindex_table_size, uniq_len, neighbors_len, - place_, - sample_stream_); + place, + stream); final_nodes_vec.emplace_back(tmp_final_nodes); final_nodes_len_vec.emplace_back(final_nodes_len); } else { @@ -2049,11 +2521,11 @@ std::shared_ptr GraphDataGenerator::GenerateSampleGraph( auto tmp_final_nodes = GetReindexResult(reindex_src_data, final_nodes_data, &final_nodes_len, - conf_.reindex_table_size, + conf.reindex_table_size, final_nodes_len_vec[i - 1], neighbors_len, - place_, - sample_stream_); + place, + stream); final_nodes_vec.emplace_back(tmp_final_nodes); final_nodes_len_vec.emplace_back(final_nodes_len); } @@ -2063,428 +2535,125 @@ std::shared_ptr GraphDataGenerator::GenerateSampleGraph( edges_split_num.emplace_back(neighbors_len); graph_edges.emplace_back(neighbors); graph_edges.emplace_back(reindex_dst); - if (conf_.return_weight) { + if (conf.return_weight) { graph_edges.emplace_back(weights); } edges_split_num_for_graph.emplace_back(edges_split_num); } - graph_edges_vec_.emplace_back(graph_edges); - edges_split_num_vec_.emplace_back(edges_split_num_for_graph); + graph_edges_vec_ptr->emplace_back(graph_edges); + edges_split_num_vec_ptr->emplace_back(edges_split_num_for_graph); + inverse_vec_ptr->emplace_back(inverse); *final_len = final_nodes_len_vec[len_samples - 1]; return final_nodes_vec[len_samples - 1]; } -std::shared_ptr GraphDataGenerator::GetNodeDegree( - uint64_t *node_ids, int len) { - auto node_degree = memory::AllocShared( - place_, - len * edge_to_id_len_ * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); +std::shared_ptr GetNodeDegree( + uint64_t *node_ids, + int len, + const GraphDataGeneratorConfig &conf, + const paddle::platform::Place &place, + cudaStream_t stream) { + auto node_degree = + memory::AllocShared(place, + len * conf.edge_to_id_len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); auto edge_to_id = gpu_graph_ptr->edge_to_id; for (auto &iter : edge_to_id) { int edge_idx = iter.second; gpu_graph_ptr->get_node_degree( - conf_.gpuid, edge_idx, 
node_ids, len, node_degree); + conf.gpuid, edge_idx, node_ids, len, node_degree); } return node_degree; } -uint64_t CopyUniqueNodes( - HashTable *table, - uint64_t copy_unique_len, - const paddle::platform::Place &place, - const std::shared_ptr &d_uniq_node_num_ptr, - std::vector &host_vec, // output - cudaStream_t stream) { - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - uint64_t h_uniq_node_num = 0; - uint64_t *d_uniq_node_num = - reinterpret_cast(d_uniq_node_num_ptr->ptr()); - cudaMemcpyAsync(&h_uniq_node_num, - d_uniq_node_num, - sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream); - cudaStreamSynchronize(stream); - auto d_uniq_node = memory::AllocShared( - place, - h_uniq_node_num * sizeof(uint64_t), - phi::Stream(reinterpret_cast(stream))); - uint64_t *d_uniq_node_ptr = - reinterpret_cast(d_uniq_node->ptr()); - - auto d_node_cursor = memory::AllocShared( - place, - sizeof(uint64_t), - phi::Stream(reinterpret_cast(stream))); - - uint64_t *d_node_cursor_ptr = - reinterpret_cast(d_node_cursor->ptr()); - cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), stream); - // uint64_t unused_key = std::numeric_limits::max(); - table->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, stream); - - cudaStreamSynchronize(stream); - - host_vec.resize(h_uniq_node_num + copy_unique_len); - cudaMemcpyAsync(host_vec.data() + copy_unique_len, - d_uniq_node_ptr, - sizeof(uint64_t) * h_uniq_node_num, - cudaMemcpyDeviceToHost, - stream); - cudaStreamSynchronize(stream); - return h_uniq_node_num; - } - return 0; -} - -void GraphDataGenerator::DoWalkandSage() { - int device_id = place_.GetDeviceId(); - debug_gpu_memory_info(device_id, "DoWalkandSage start"); - platform::CUDADeviceGuard guard(conf_.gpuid); - if (conf_.gpu_graph_training) { - // train - bool train_flag; - if (FLAGS_graph_metapath_split_opt) { - train_flag = FillWalkBufMultiPath(); - } else { - train_flag = FillWalkBuf(); - } - - if (conf_.sage_mode) { - sage_batch_num_ = 0; - if (train_flag) { - int total_instance = 0, uniq_instance = 0; - bool ins_pair_flag = true; - int sage_pass_end = 0; - uint64_t *ins_buf, *ins_cursor; - while (ins_pair_flag) { - int res = 0; - while (ins_buf_pair_len_ < conf_.batch_size) { - res = FillInsBuf(d_walk_, - d_walk_ntype_, - conf_, - d_random_row_, - d_random_row_col_shift_, - buf_state_, - d_ins_buf_, - d_pair_label_buf_, - d_pair_num_, - ins_buf_pair_len_, - sample_stream_); - if (res == -1) { - if (ins_buf_pair_len_ == 0) { - if (is_multi_node_) { - sage_pass_end = 1; - if (total_row_ != 0) { - buf_state_.Reset(total_row_); - VLOG(1) << "reset buf state to make batch num equal in multi node"; - } - } else { - ins_pair_flag = false; - break; - } - } else { - break; - } - } - } - - // check whether reach sage pass end - if (is_multi_node_) { - int res = multi_node_sync_sample(sage_pass_end, ncclProd); - if (res) { - ins_pair_flag = false; - } - } - - if (!ins_pair_flag) { - break; - } - - total_instance = ins_buf_pair_len_ < conf_.batch_size - ? 
ins_buf_pair_len_ - : conf_.batch_size; - total_instance *= 2; - - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - auto inverse = memory::AllocShared( - place_, - total_instance * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - auto final_sage_nodes = GenerateSampleGraph( - ins_cursor, total_instance, &uniq_instance, inverse); - uint64_t *final_sage_nodes_ptr = - reinterpret_cast(final_sage_nodes->ptr()); - if (conf_.get_degree) { - auto node_degrees = - GetNodeDegree(final_sage_nodes_ptr, uniq_instance); - node_degree_vec_.emplace_back(node_degrees); - } - - if (conf_.enable_pair_label) { - auto pair_label = memory::AllocShared( - place_, - total_instance / 2 * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - int32_t *pair_label_buf = - reinterpret_cast(d_pair_label_buf_->ptr()); - int32_t *pair_label_cursor = - pair_label_buf + ins_buf_pair_len_ - total_instance / 2; - cudaMemcpyAsync(pair_label->ptr(), - pair_label_cursor, - sizeof(int32_t) * total_instance / 2, - cudaMemcpyDeviceToDevice, - sample_stream_); - pair_label_vec_.emplace_back(pair_label); - } - - cudaStreamSynchronize(sample_stream_); - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - uint64_t *final_sage_nodes_ptr = - reinterpret_cast(final_sage_nodes->ptr()); - InsertTable(final_sage_nodes_ptr, - uniq_instance, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_); - } - final_sage_nodes_vec_.emplace_back(final_sage_nodes); - inverse_vec_.emplace_back(inverse); - uniq_instance_vec_.emplace_back(uniq_instance); - total_instance_vec_.emplace_back(total_instance); - ins_buf_pair_len_ -= total_instance / 2; - sage_batch_num_ += 1; - } - uint64_t h_uniq_node_num = CopyUniqueNodes(table_, - copy_unique_len_, - place_, - d_uniq_node_num_, - host_vec_, - sample_stream_); - VLOG(1) << "train sage_batch_num: " << sage_batch_num_; - } - } - } else { - // infer - bool infer_flag = FillInferBuf(); - if (conf_.sage_mode) { - sage_batch_num_ = 0; - if (infer_flag) { - // Set new batch size for multi_node - if (is_multi_node_) { - int new_batch_size = dynamic_adjust_batch_num_for_sage(); - conf_.batch_size = new_batch_size; - } - - int total_instance = 0, uniq_instance = 0; - total_instance = - (infer_node_start_ + conf_.batch_size <= infer_node_end_) - ? 
conf_.batch_size - : infer_node_end_ - infer_node_start_; - total_instance *= 2; - while (total_instance != 0) { - uint64_t *d_type_keys = reinterpret_cast( - d_device_keys_[infer_cursor_]->ptr()); - d_type_keys += infer_node_start_; - infer_node_start_ += total_instance / 2; - auto node_buf = memory::AllocShared( - place_, - total_instance * sizeof(uint64_t), - phi::Stream(reinterpret_cast(sample_stream_))); - int64_t *node_buf_ptr = reinterpret_cast(node_buf->ptr()); - CopyDuplicateKeys<<>>( - node_buf_ptr, d_type_keys, total_instance / 2); - uint64_t *node_buf_ptr_ = - reinterpret_cast(node_buf->ptr()); - auto inverse = memory::AllocShared( - place_, - total_instance * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - auto final_sage_nodes = GenerateSampleGraph( - node_buf_ptr_, total_instance, &uniq_instance, inverse); - uint64_t *final_sage_nodes_ptr = - reinterpret_cast(final_sage_nodes->ptr()); - if (conf_.get_degree) { - auto node_degrees = - GetNodeDegree(final_sage_nodes_ptr, uniq_instance); - node_degree_vec_.emplace_back(node_degrees); - } - cudaStreamSynchronize(sample_stream_); - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - uint64_t *final_sage_nodes_ptr = - reinterpret_cast(final_sage_nodes->ptr()); - InsertTable(final_sage_nodes_ptr, - uniq_instance, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_); - } - final_sage_nodes_vec_.emplace_back(final_sage_nodes); - inverse_vec_.emplace_back(inverse); - uniq_instance_vec_.emplace_back(uniq_instance); - total_instance_vec_.emplace_back(total_instance); - sage_batch_num_ += 1; - - total_instance = - (infer_node_start_ + conf_.batch_size <= infer_node_end_) - ? conf_.batch_size - : infer_node_end_ - infer_node_start_; - total_instance *= 2; - } - - uint64_t h_uniq_node_num = CopyUniqueNodes(table_, - copy_unique_len_, - place_, - d_uniq_node_num_, - host_vec_, - sample_stream_); - VLOG(1) << "infer sage_batch_num: " << sage_batch_num_; - } - } - } - debug_gpu_memory_info(device_id, "DoWalkandSage end"); -} - -void GraphDataGenerator::clear_gpu_mem() { - platform::CUDADeviceGuard guard(conf_.gpuid); - d_sample_keys_.reset(); - d_prefix_sum_.reset(); - for (size_t i = 0; i < d_sampleidx2rows_.size(); i++) { - d_sampleidx2rows_[i].reset(); +int multi_node_sync_sample(int flag, + const ncclRedOp_t &op, + const paddle::platform::Place &place, + phi::DenseTensor *multi_node_sync_stat_ptr) { + if (flag < 0 && flag > 2) { + VLOG(0) << "invalid flag! 
" << flag; + assert(false); + return -1; } - delete table_; -} -int GraphDataGenerator::FillInferBuf() { - platform::CUDADeviceGuard guard(conf_.gpuid); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto &global_infer_node_type_start = - gpu_graph_ptr->global_infer_node_type_start_[conf_.gpuid]; - auto &infer_cursor = gpu_graph_ptr->infer_cursor_[conf_.thread_id]; - total_row_ = 0; - if (infer_cursor < h_device_keys_len_.size()) { - while (global_infer_node_type_start[infer_cursor] >= - h_device_keys_len_[infer_cursor]) { - infer_cursor++; - if (infer_cursor >= h_device_keys_len_.size()) { - return 0; - } - } - if (!infer_node_type_index_set_.empty()) { - while (infer_cursor < h_device_keys_len_.size()) { - if (infer_node_type_index_set_.find(infer_cursor) == - infer_node_type_index_set_.end()) { - VLOG(2) << "Skip cursor[" << infer_cursor << "]"; - infer_cursor++; - continue; - } else { - VLOG(2) << "Not skip cursor[" << infer_cursor << "]"; - break; - } - } - if (infer_cursor >= h_device_keys_len_.size()) { - return 0; - } - } - - size_t device_key_size = h_device_keys_len_[infer_cursor]; - total_row_ = - (global_infer_node_type_start[infer_cursor] + buf_size_ <= - device_key_size) - ? buf_size_ - : device_key_size - global_infer_node_type_start[infer_cursor]; - - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[infer_cursor]->ptr()); - if (!conf_.sage_mode) { - host_vec_.resize(total_row_); - cudaMemcpyAsync(host_vec_.data(), - d_type_keys + global_infer_node_type_start[infer_cursor], - sizeof(uint64_t) * total_row_, - cudaMemcpyDeviceToHost, - sample_stream_); - cudaStreamSynchronize(sample_stream_); - } - VLOG(1) << "cursor: " << infer_cursor - << " start: " << global_infer_node_type_start[infer_cursor] - << " num: " << total_row_; - infer_node_start_ = global_infer_node_type_start[infer_cursor]; - global_infer_node_type_start[infer_cursor] += total_row_; - infer_node_end_ = global_infer_node_type_start[infer_cursor]; - infer_cursor_ = infer_cursor; - return 1; - } - return 0; + int ret = 0; +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_GPU_GRAPH) + int *stat_ptr = multi_node_sync_stat_ptr->data(); + auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); + auto stream = comm->stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + &stat_ptr[flag], &stat_ptr[3], 1, ncclInt, op, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output + &stat_ptr[3], + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +#endif + return ret; } -void GraphDataGenerator::ClearSampleState() { - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto &finish_node_type = gpu_graph_ptr->finish_node_type_[conf_.gpuid]; - auto &node_type_start = gpu_graph_ptr->node_type_start_[conf_.gpuid]; - finish_node_type.clear(); - for (auto iter = node_type_start.begin(); iter != node_type_start.end(); - iter++) { - iter->second = 0; - } -} +int FillWalkBuf(const std::vector &h_device_keys_len, + const std::vector> + &d_device_keys, // input + const std::vector> &meta_path, // input + const GraphDataGeneratorConfig &conf, + bool *epoch_finish_ptr, + uint64_t *copy_unique_len_ptr, + const paddle::platform::Place &place, + const std::vector &first_node_type, + std::unordered_map *node_type_start_ptr, + std::set *finish_node_type_ptr, + uint64_t *walk, // output + uint8_t *walk_ntype, + std::shared_ptr *d_uniq_node_num, + int *d_random_row, + int 
*d_random_row_col_shift, + phi::DenseTensor *multi_node_sync_stat_ptr, + std::vector *host_vec_ptr, + int *total_row_ptr, + size_t *jump_rows_ptr, + int *shuffle_seed_ptr, + HashTable *table, + BufState *buf_state, + cudaStream_t stream) { + platform::CUDADeviceGuard guard(conf.gpuid); -int GraphDataGenerator::FillWalkBuf() { - platform::CUDADeviceGuard guard(conf_.gpuid); - size_t once_max_sample_keynum = - conf_.walk_degree * conf_.once_sample_startid_len; //////// uint64_t *h_walk; - uint64_t *h_sample_keys; - int *h_offset2idx; - int *h_len_per_row; - uint64_t *h_prefix_sum; - if (conf_.debug_mode) { - h_walk = new uint64_t[buf_size_]; - h_sample_keys = new uint64_t[once_max_sample_keynum]; - h_offset2idx = new int[once_max_sample_keynum]; - h_len_per_row = new int[once_max_sample_keynum]; - h_prefix_sum = new uint64_t[once_max_sample_keynum + 1]; + if (conf.debug_mode) { + h_walk = new uint64_t[conf.buf_size]; } /////// - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = reinterpret_cast(d_walk_->ptr()); - uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); - uint8_t *walk_ntype = NULL; - if (conf_.need_walk_ntype) { - walk_ntype = reinterpret_cast(d_walk_ntype_->ptr()); - cudaMemsetAsync(walk_ntype, 0, buf_size_ * sizeof(uint8_t), sample_stream_); + cudaMemsetAsync(walk, 0, conf.buf_size * sizeof(uint64_t), stream); + if (conf.need_walk_ntype) { + cudaMemsetAsync(walk_ntype, 0, conf.buf_size * sizeof(uint8_t), stream); } int sample_times = 0; int i = 0; - total_row_ = 0; + *total_row_ptr = 0; + + std::vector> d_sampleidx2rows; + d_sampleidx2rows.push_back(memory::AllocShared( + place, + conf.once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(stream)))); + d_sampleidx2rows.push_back(memory::AllocShared( + place, + conf.once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(stream)))); + int cur_sampleidx2row = 0; // 获取全局采样状态 - auto &first_node_type = gpu_graph_ptr->first_node_type_; - auto &meta_path = gpu_graph_ptr->meta_path_; - auto &node_type_start = gpu_graph_ptr->node_type_start_[conf_.gpuid]; - auto &finish_node_type = gpu_graph_ptr->finish_node_type_[conf_.gpuid]; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); - auto &cursor = gpu_graph_ptr->cursor_[conf_.thread_id]; + auto &cursor = gpu_graph_ptr->cursor_[conf.thread_id]; size_t node_type_len = first_node_type.size(); - int remain_size = buf_size_ - conf_.walk_degree * - conf_.once_sample_startid_len * - conf_.walk_len; + int remain_size = conf.buf_size - conf.walk_degree * + conf.once_sample_startid_len * + conf.walk_len; int total_samples = 0; // Definition of variables related to multi machine sampling @@ -2517,28 +2686,28 @@ int GraphDataGenerator::FillWalkBuf() { int cur_node_idx = cursor % node_type_len; int node_type = first_node_type[cur_node_idx]; auto &path = meta_path[cur_node_idx]; - size_t start = node_type_start[node_type]; + size_t start = (*node_type_start_ptr)[node_type]; int type_index = type_to_index[node_type]; - size_t device_key_size = h_device_keys_len_[type_index]; + size_t device_key_size = h_device_keys_len[type_index]; uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[type_index]->ptr()); - int tmp_len = start + conf_.once_sample_startid_len > device_key_size + reinterpret_cast(d_device_keys[type_index]->ptr()); + int tmp_len = start + conf.once_sample_startid_len > 
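// (The Chinese comment above, "获取全局采样状态", means "fetch the global sampling
// state".) FillWalkBuf packs walks row-major into a single device buffer: each start
// node can contribute walk_degree rows of walk_len slots, the write offset i advances
// by jump_rows * walk_len per round, and sampling stops once fewer than one full round
// would fit (remain_size). A host sketch of that bookkeeping with illustrative sizes,
// assuming every key yields all walk_degree rows (the real jump_rows is the actual
// sample count returned by the sampler):
#include <cstdio>

int main() {
  const int buf_size = 64;                // conf.buf_size
  const int walk_len = 8;                 // slots per walk row
  const int walk_degree = 2;              // rows started per key
  const int once_sample_startid_len = 2;  // keys consumed per round

  const int remain_size =
      buf_size - walk_degree * once_sample_startid_len * walk_len;
  int i = 0, total_row = 0;
  while (i <= remain_size) {
    int jump_rows = walk_degree * once_sample_startid_len;  // rows produced this round
    i += jump_rows * walk_len;                              // advance the write offset
    total_row += jump_rows;
    printf("round done: i = %d, total_row = %d\n", i, total_row);
  }
  printf("%d rows of length %d fill %d of %d buffer slots\n",
         total_row, walk_len, total_row * walk_len, buf_size);
  return 0;
}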
device_key_size ? device_key_size - start - : conf_.once_sample_startid_len; + : conf.once_sample_startid_len; VLOG(2) << "choose node_type: " << node_type << " cur_node_idx: " << cur_node_idx << " meta_path.size: " << meta_path.size() << " key_size: " << device_key_size << " start: " << start << " tmp_len: " << tmp_len; if (tmp_len == 0) { - finish_node_type.insert(node_type); - if (finish_node_type.size() == node_type_start.size()) { + finish_node_type_ptr->insert(node_type); + if (finish_node_type_ptr->size() == node_type_start_ptr->size()) { // scenarios 2: epoch finish if (FLAGS_enable_graph_multi_node_sampling) { sample_flag = EVENT_FINISH_EPOCH; } else { cursor = 0; - epoch_finish_ = true; + *epoch_finish_ptr = true; break; } } @@ -2558,8 +2727,9 @@ int GraphDataGenerator::FillWalkBuf() { // Perform synchronous information exchange between multiple machines // to decide whether to continue sampling if (FLAGS_enable_graph_multi_node_sampling) { - switch_command = multi_node_sync_sample(switch_flag, ncclProd); - VLOG(2) << "gpuid:" << conf_.gpuid << " multi node sample sync" + switch_command = multi_node_sync_sample( + switch_flag, ncclProd, place, multi_node_sync_stat_ptr); + VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " switch_flag:" << switch_flag << "," << switch_command; if (switch_command) { cursor += 1; @@ -2567,13 +2737,14 @@ int GraphDataGenerator::FillWalkBuf() { continue; } - sample_command = multi_node_sync_sample(sample_flag, ncclMax); - VLOG(2) << "gpuid:" << conf_.gpuid << " multi node sample sync" + sample_command = multi_node_sync_sample( + sample_flag, ncclMax, place, multi_node_sync_stat_ptr); + VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " sample_flag:" << sample_flag << "," << sample_command; if (sample_command == EVENT_FINISH_EPOCH) { // end sampling current epoch cursor = 0; - epoch_finish_ = true; + *epoch_finish_ptr = true; VLOG(0) << "sample epoch finish!"; break; } else if (sample_command == EVENT_WALKBUF_FULL) { @@ -2593,71 +2764,70 @@ int GraphDataGenerator::FillWalkBuf() { bool update = true; uint64_t *cur_walk = walk + i; uint8_t *cur_walk_ntype = NULL; - if (conf_.need_walk_ntype) { + if (conf.need_walk_ntype) { cur_walk_ntype = walk_ntype + i; } NeighborSampleQuery q; - q.initialize(conf_.gpuid, + q.initialize(conf.gpuid, path[0], (uint64_t)(d_type_keys + start), - conf_.walk_degree, + conf.walk_degree, tmp_len, step); auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3( - q, false, true, conf_.weighted_sample); + q, false, true, conf.weighted_sample); - jump_rows_ = sample_res.total_sample_size; + *jump_rows_ptr = sample_res.total_sample_size; total_samples += sample_res.total_sample_size; if (FLAGS_enable_graph_multi_node_sampling) { - int flag = jump_rows_ > 0 ? 1 : 0; - int command = multi_node_sync_sample(flag, ncclMax); - VLOG(2) << "gpuid:" << conf_.gpuid << " multi node step sync" + int flag = *jump_rows_ptr > 0 ? 
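// With FLAGS_enable_graph_multi_node_sampling, the loop above never lets one rank end
// an epoch or declare its walk buffer full on its own: each rank proposes a sample
// event code, the codes are combined with ncclMax (the cursor-switch flag uses
// ncclProd), and every rank acts on the combined value so batch counts stay aligned
// across machines. The sketch below simulates the reduction on the host; the numeric
// event values are assumptions mirroring EVENT_WALKBUF_FULL and EVENT_FINISH_EPOCH,
// whose real definitions live elsewhere in this file:
#include <algorithm>
#include <cstdio>
#include <vector>

enum { EVT_CONTINUE = 0, EVT_WALKBUF_FULL = 1, EVT_FINISH_EPOCH = 2 };  // assumed values

int main() {
  // One proposed event per rank: rank 0 exhausted its keys, rank 1 could keep going.
  std::vector<int> proposed = {EVT_FINISH_EPOCH, EVT_CONTINUE};
  int command = *std::max_element(proposed.begin(), proposed.end());  // ncclMax
  if (command == EVT_FINISH_EPOCH)
    printf("all ranks end the epoch together\n");
  else if (command == EVT_WALKBUF_FULL)
    printf("all ranks stop filling the walk buffer\n");
  else
    printf("all ranks keep sampling\n");
  return 0;
}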
1 : 0; + int command = multi_node_sync_sample( + flag, ncclMax, place, multi_node_sync_stat_ptr); + VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; if (command <= 0) { - node_type_start[node_type] = tmp_len + start; + (*node_type_start_ptr)[node_type] = tmp_len + start; cursor += 1; continue; } - } else if (jump_rows_ == 0) { - node_type_start[node_type] = tmp_len + start; + } else if (*jump_rows_ptr == 0) { + (*node_type_start_ptr)[node_type] = tmp_len + start; cursor += 1; continue; } - if (!conf_.sage_mode) { - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - if (InsertTable(d_type_keys + start, - tmp_len, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(2) << "gpu:" << conf_.gpuid - << " in step 0, insert key stage, table is full"; - update = false; - assert(false); - break; - } - if (InsertTable(sample_res.actual_val, - sample_res.total_sample_size, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(2) << "gpu:" << conf_.gpuid - << " in step 0, insert sample res, table is full"; - update = false; - assert(false); - break; - } + if (!conf.sage_mode) { + if (InsertTable(d_type_keys + start, + tmp_len, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(2) << "gpu:" << conf.gpuid + << " in step 0, insert key stage, table is full"; + update = false; + assert(false); + break; + } + if (InsertTable(sample_res.actual_val, + sample_res.total_sample_size, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(2) << "gpu:" << conf.gpuid + << " in step 0, insert sample res, table is full"; + update = false; + assert(false); + break; } } FillOneStep(d_type_keys + start, @@ -2665,21 +2835,21 @@ int GraphDataGenerator::FillWalkBuf() { cur_walk, cur_walk_ntype, tmp_len, - sample_res, - conf_.walk_degree, + &sample_res, + conf.walk_degree, step, - conf_, - d_sample_keys_, - d_prefix_sum_, - d_sampleidx2rows_, - cur_sampleidx2row_, - place_, - sample_stream_); + conf, + &d_sampleidx2rows, + &cur_sampleidx2row, + place, + stream); ///////// - if (conf_.debug_mode) { - cudaMemcpy( - h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < buf_size_; xx++) { + if (conf.debug_mode) { + cudaMemcpy(h_walk, + walk, + conf.buf_size * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < conf.buf_size; xx++) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } @@ -2687,12 +2857,13 @@ int GraphDataGenerator::FillWalkBuf() { ///////// step++; size_t path_len = path.size(); - for (; step < conf_.walk_len; step++) { + for (; step < conf.walk_len; step++) { if (FLAGS_enable_graph_multi_node_sampling) { // Step synchronization for multi-step sampling in multi node int flag = sample_res.total_sample_size > 0 ? 
1 : 0; - int command = multi_node_sync_sample(flag, ncclMax); - VLOG(2) << "gpuid:" << conf_.gpuid << " multi node step sync" + int command = multi_node_sync_sample( + flag, ncclMax, place, multi_node_sync_stat_ptr); + VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; if (command <= 0) { @@ -2712,7 +2883,7 @@ int GraphDataGenerator::FillWalkBuf() { sample_keys_ptr = reinterpret_cast(sample_key_mem->ptr()); } int edge_type_id = path[(step - 1) % path_len]; - q.initialize(conf_.gpuid, + q.initialize(conf.gpuid, edge_type_id, (uint64_t)sample_keys_ptr, 1, @@ -2720,25 +2891,23 @@ int GraphDataGenerator::FillWalkBuf() { step); int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3( - q, false, true, conf_.weighted_sample); + q, false, true, conf.weighted_sample); total_samples += sample_res.total_sample_size; - if (!conf_.sage_mode) { - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - if (InsertTable(sample_res.actual_val, - sample_res.total_sample_size, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(0) << "gpu:" << conf_.gpuid << " in step: " << step - << ", table is full"; - update = false; - assert(false); - break; - } + if (!conf.sage_mode) { + if (InsertTable(sample_res.actual_val, + sample_res.total_sample_size, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(0) << "gpu:" << conf.gpuid << " in step: " << step + << ", table is full"; + update = false; + assert(false); + break; } } FillOneStep(d_type_keys + start, @@ -2746,20 +2915,20 @@ int GraphDataGenerator::FillWalkBuf() { cur_walk, cur_walk_ntype, sample_key_len, - sample_res, + &sample_res, 1, step, - conf_, - d_sample_keys_, - d_prefix_sum_, - d_sampleidx2rows_, - cur_sampleidx2row_, - place_, - sample_stream_); - if (conf_.debug_mode) { - cudaMemcpy( - h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < buf_size_; xx++) { + conf, + &d_sampleidx2rows, + &cur_sampleidx2row, + place, + stream); + if (conf.debug_mode) { + cudaMemcpy(h_walk, + walk, + conf.buf_size * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < conf.buf_size; xx++) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } @@ -2767,9 +2936,9 @@ int GraphDataGenerator::FillWalkBuf() { // 此时更新全局采样状态 if (update == true) { - node_type_start[node_type] = tmp_len + start; - i += jump_rows_ * conf_.walk_len; - total_row_ += jump_rows_; + (*node_type_start_ptr)[node_type] = tmp_len + start; + i += *jump_rows_ptr * conf.walk_len; + *total_row_ptr += *jump_rows_ptr; cursor += 1; sample_times++; } else { @@ -2781,107 +2950,101 @@ int GraphDataGenerator::FillWalkBuf() { VLOG(2) << "sample " << sample_times << " finish, node_type=" << node_type << ", path:[" << path[0] << "," << path[1] << "]" << ", start:" << start << ", len:" << tmp_len - << ", row:" << jump_rows_ << ", total_step:" << step + << ", row:" << *jump_rows_ptr << ", total_step:" << step << ", device_key_size:" << device_key_size; } - buf_state_.Reset(total_row_); - int *d_random_row = reinterpret_cast(d_random_row_->ptr()); - int *d_random_row_col_shift = - reinterpret_cast(d_random_row_col_shift_->ptr()); - - paddle::memory::ThrustAllocator allocator(place_, - sample_stream_); - thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = 
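// (The Chinese comment above, "此时更新全局采样状态", means "update the global sampling
// state at this point".) The walk itself is driven by a meta path of edge-type ids:
// the first hop samples walk_degree neighbors per key along path[0], and every later
// hop samples a single neighbor along path[(step - 1) % path_len], so the edge types
// cycle when the walk is longer than the path. A host sketch with illustrative ids:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> path = {0, 3, 1};  // edge-type ids, e.g. u2i, i2t, t2i
  const int walk_len = 7;
  const size_t path_len = path.size();

  printf("step 1: edge_type %d, degree = walk_degree\n", path[0]);
  for (int step = 2; step < walk_len; ++step)
    printf("step %d: edge_type %d, degree = 1\n", step, path[(step - 1) % path_len]);
  return 0;
}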
thrust::cuda::par(allocator).on(sample_stream_); + buf_state->Reset(*total_row_ptr); + paddle::memory::ThrustAllocator allocator(place, stream); + thrust::random::default_random_engine engine(*shuffle_seed_ptr); + const auto &exec_policy = thrust::cuda::par(allocator).on(stream); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row_, + cnt_iter + *total_row_ptr, thrust::device_pointer_cast(d_random_row), engine); thrust::transform(exec_policy, cnt_iter, - cnt_iter + total_row_, + cnt_iter + *total_row_ptr, thrust::device_pointer_cast(d_random_row_col_shift), - RandInt(0, conf_.walk_len)); + RandInt(0, conf.walk_len)); - cudaStreamSynchronize(sample_stream_); - shuffle_seed_ = engine(); + cudaStreamSynchronize(stream); + *shuffle_seed_ptr = engine(); - if (conf_.debug_mode) { - int *h_random_row = new int[total_row_ + 10]; + if (conf.debug_mode) { + int *h_random_row = new int[*total_row_ptr + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row_ * sizeof(int), + *total_row_ptr * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row_; xx++) { + for (int xx = 0; xx < *total_row_ptr; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; delete[] h_walk; - delete[] h_sample_keys; - delete[] h_offset2idx; - delete[] h_len_per_row; - delete[] h_prefix_sum; } - if (!conf_.sage_mode) { - uint64_t h_uniq_node_num = CopyUniqueNodes(table_, - copy_unique_len_, - place_, - d_uniq_node_num_, - host_vec_, - sample_stream_); - VLOG(1) << "sample_times:" << sample_times << ", d_walk_size:" << buf_size_ - << ", d_walk_offset:" << i << ", total_rows:" << total_row_ - << ", total_samples:" << total_samples - << ", h_uniq_node_num: " << h_uniq_node_num; - } else { - VLOG(1) << "sample_times:" << sample_times << ", d_walk_size:" << buf_size_ - << ", d_walk_offset:" << i << ", total_rows:" << total_row_ - << ", total_samples:" << total_samples; - } - return total_row_ != 0; + return *total_row_ptr != 0; } -int GraphDataGenerator::FillWalkBufMultiPath() { - platform::CUDADeviceGuard guard(conf_.gpuid); - size_t once_max_sample_keynum = - conf_.walk_degree * conf_.once_sample_startid_len; + +int FillWalkBufMultiPath( + const std::vector &h_device_keys_len, + const std::vector> &meta_path, + const GraphDataGeneratorConfig &conf, + bool *epoch_finish_ptr, + uint64_t *copy_unique_len_ptr, + const paddle::platform::Place &place, + const std::vector &first_node_type, + std::unordered_map *node_type_start_ptr, + uint64_t *walk, // output + uint8_t *walk_ntype, + std::shared_ptr *d_uniq_node_num, + int *d_random_row, + int *d_random_row_col_shift, + std::vector *host_vec_ptr, + int *total_row_ptr, + size_t *jump_rows_ptr, + int *shuffle_seed_ptr, + uint64_t *d_train_metapath_keys, + uint64_t *h_train_metapath_keys_len_ptr, + HashTable *table, + BufState *buf_state, + cudaStream_t stream) { + platform::CUDADeviceGuard guard(conf.gpuid); + //////// uint64_t *h_walk; - uint64_t *h_sample_keys; - int *h_offset2idx; - int *h_len_per_row; - uint64_t *h_prefix_sum; - if (conf_.debug_mode) { - h_walk = new uint64_t[buf_size_]; - h_sample_keys = new uint64_t[once_max_sample_keynum]; - h_offset2idx = new int[once_max_sample_keynum]; - h_len_per_row = new int[once_max_sample_keynum]; - h_prefix_sum = new uint64_t[once_max_sample_keynum + 1]; + if (conf.debug_mode) { + h_walk = new uint64_t[conf.buf_size]; } /////// auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = 
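// After the buffer is filled, the rows are consumed in random order: shuffle_copy
// writes a random permutation of the row ids into d_random_row, and transform draws a
// random starting-column shift per row so batches do not always begin at walk position
// 0. A compilable sketch of those two calls; RandInt here is a stand-in for the
// functor defined elsewhere in this file:
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/shuffle.h>
#include <thrust/transform.h>
#include <cstdio>

struct RandInt {
  int lo, hi;
  RandInt(int l, int h) : lo(l), hi(h) {}
  __host__ __device__ int operator()(int i) const {
    thrust::default_random_engine eng(1234);
    thrust::uniform_int_distribution<int> dist(lo, hi - 1);
    eng.discard(i);
    return dist(eng);
  }
};

int main() {
  const int total_row = 8, walk_len = 24;
  thrust::device_vector<int> d_random_row(total_row);
  thrust::device_vector<int> d_random_row_col_shift(total_row);
  thrust::counting_iterator<int> cnt_iter(0);
  thrust::default_random_engine engine(42);  // seeded as shuffle_seed_ is above

  thrust::shuffle_copy(thrust::device, cnt_iter, cnt_iter + total_row,
                       d_random_row.begin(), engine);
  thrust::transform(thrust::device, cnt_iter, cnt_iter + total_row,
                    d_random_row_col_shift.begin(), RandInt(0, walk_len));

  for (int i = 0; i < total_row; ++i)
    printf("row %d, col shift %d\n", (int)d_random_row[i],
           (int)d_random_row_col_shift[i]);
  return 0;
}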
reinterpret_cast(d_walk_->ptr()); - uint8_t *walk_ntype = NULL; - if (conf_.need_walk_ntype) { - walk_ntype = reinterpret_cast(d_walk_ntype_->ptr()); + cudaMemsetAsync(walk, 0, conf.buf_size * sizeof(uint64_t), stream); + if (conf.need_walk_ntype) { + cudaMemsetAsync(walk_ntype, 0, conf.buf_size * sizeof(uint8_t), stream); } - uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); int sample_times = 0; int i = 0; - total_row_ = 0; + *total_row_ptr = 0; + + std::vector> d_sampleidx2rows; + d_sampleidx2rows.push_back(memory::AllocShared( + place, + conf.once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(stream)))); + d_sampleidx2rows.push_back(memory::AllocShared( + place, + conf.once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(stream)))); + int cur_sampleidx2row = 0; // 获取全局采样状态 - auto &first_node_type = gpu_graph_ptr->first_node_type_; auto &cur_metapath = gpu_graph_ptr->cur_metapath_; - auto &meta_path = gpu_graph_ptr->meta_path_; auto &path = gpu_graph_ptr->cur_parse_metapath_; - auto &cur_metapath_start = gpu_graph_ptr->cur_metapath_start_[conf_.gpuid]; - auto &finish_node_type = gpu_graph_ptr->finish_node_type_[conf_.gpuid]; + auto &cur_metapath_start = gpu_graph_ptr->cur_metapath_start_[conf.gpuid]; auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); size_t node_type_len = first_node_type.size(); std::string first_node = @@ -2889,31 +3052,30 @@ int GraphDataGenerator::FillWalkBufMultiPath() { auto it = gpu_graph_ptr->node_to_id.find(first_node); auto node_type = it->second; - int remain_size = buf_size_ - conf_.walk_degree * - conf_.once_sample_startid_len * - conf_.walk_len; + int remain_size = conf.buf_size - conf.walk_degree * + conf.once_sample_startid_len * + conf.walk_len; int total_samples = 0; while (i <= remain_size) { size_t start = cur_metapath_start; - size_t device_key_size = h_train_metapath_keys_len_; + size_t device_key_size = *h_train_metapath_keys_len_ptr; VLOG(2) << "type: " << node_type << " size: " << device_key_size << " start: " << start; - uint64_t *d_type_keys = - reinterpret_cast(d_train_metapath_keys_->ptr()); - int tmp_len = start + conf_.once_sample_startid_len > device_key_size + uint64_t *d_type_keys = d_train_metapath_keys; + int tmp_len = start + conf.once_sample_startid_len > device_key_size ? 
device_key_size - start - : conf_.once_sample_startid_len; + : conf.once_sample_startid_len; bool update = true; if (tmp_len == 0) { - epoch_finish_ = true; + *epoch_finish_ptr = true; break; } - VLOG(2) << "gpuid = " << conf_.gpuid << " path[0] = " << path[0]; + VLOG(2) << "gpuid = " << conf.gpuid << " path[0] = " << path[0]; uint64_t *cur_walk = walk + i; uint8_t *cur_walk_ntype = NULL; - if (conf_.need_walk_ntype) { + if (conf.need_walk_ntype) { cur_walk_ntype = walk_ntype + i; } @@ -2921,52 +3083,50 @@ int GraphDataGenerator::FillWalkBufMultiPath() { VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; NeighborSampleQuery q; - q.initialize(conf_.gpuid, + q.initialize(conf.gpuid, path[0], (uint64_t)(d_type_keys + start), - conf_.walk_degree, + conf.walk_degree, tmp_len, step); auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3( - q, false, true, conf_.weighted_sample); + q, false, true, conf.weighted_sample); - jump_rows_ = sample_res.total_sample_size; + *jump_rows_ptr = sample_res.total_sample_size; total_samples += sample_res.total_sample_size; VLOG(2) << "i = " << i << " start = " << start << " tmp_len = " << tmp_len - << "jump row: " << jump_rows_; - if (jump_rows_ == 0) { + << "jump row: " << *jump_rows_ptr; + if (*jump_rows_ptr == 0) { cur_metapath_start = tmp_len + start; continue; } - if (!conf_.sage_mode) { - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - if (InsertTable(d_type_keys + start, - tmp_len, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(2) << "in step 0, insert key stage, table is full"; - update = false; - break; - } - if (InsertTable(sample_res.actual_val, - sample_res.total_sample_size, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(2) << "in step 0, insert sample res stage, table is full"; - update = false; - break; - } + if (!conf.sage_mode) { + if (InsertTable(d_type_keys + start, + tmp_len, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(2) << "in step 0, insert key stage, table is full"; + update = false; + break; + } + if (InsertTable(sample_res.actual_val, + sample_res.total_sample_size, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(2) << "in step 0, insert sample res stage, table is full"; + update = false; + break; } } @@ -2975,21 +3135,21 @@ int GraphDataGenerator::FillWalkBufMultiPath() { cur_walk, cur_walk_ntype, tmp_len, - sample_res, - conf_.walk_degree, + &sample_res, + conf.walk_degree, step, - conf_, - d_sample_keys_, - d_prefix_sum_, - d_sampleidx2rows_, - cur_sampleidx2row_, - place_, - sample_stream_); + conf, + &d_sampleidx2rows, + &cur_sampleidx2row, + place, + stream); ///////// - if (conf_.debug_mode) { - cudaMemcpy( - h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < buf_size_; xx++) { + if (conf.debug_mode) { + cudaMemcpy(h_walk, + walk, + conf.buf_size * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < conf.buf_size; xx++) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } @@ -3000,7 +3160,7 @@ int GraphDataGenerator::FillWalkBufMultiPath() { ///////// step++; size_t path_len = path.size(); - for (; step < conf_.walk_len; step++) { + for (; step < conf.walk_len; step++) { if (sample_res.total_sample_size == 0) { VLOG(2) << "sample finish, step=" << step; break; @@ -3010,7 
+3170,7 @@ int GraphDataGenerator::FillWalkBufMultiPath() { reinterpret_cast(sample_key_mem->ptr()); int edge_type_id = path[(step - 1) % path_len]; VLOG(2) << "sample edge type: " << edge_type_id << " step: " << step; - q.initialize(conf_.gpuid, + q.initialize(conf.gpuid, edge_type_id, (uint64_t)sample_keys_ptr, 1, @@ -3018,23 +3178,21 @@ int GraphDataGenerator::FillWalkBufMultiPath() { step); int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3( - q, false, true, conf_.weighted_sample); + q, false, true, conf.weighted_sample); total_samples += sample_res.total_sample_size; - if (!conf_.sage_mode) { - if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { - if (InsertTable(sample_res.actual_val, - sample_res.total_sample_size, - d_uniq_node_num_, - conf_, - copy_unique_len_, - place_, - table_, - host_vec_, - sample_stream_) != 0) { - VLOG(2) << "in step: " << step << ", table is full"; - update = false; - break; - } + if (!conf.sage_mode) { + if (InsertTable(sample_res.actual_val, + sample_res.total_sample_size, + d_uniq_node_num, + conf, + copy_unique_len_ptr, + place, + table, + host_vec_ptr, + stream) != 0) { + VLOG(2) << "in step: " << step << ", table is full"; + update = false; + break; } } FillOneStep(d_type_keys + start, @@ -3042,20 +3200,20 @@ int GraphDataGenerator::FillWalkBufMultiPath() { cur_walk, cur_walk_ntype, sample_key_len, - sample_res, + &sample_res, 1, step, - conf_, - d_sample_keys_, - d_prefix_sum_, - d_sampleidx2rows_, - cur_sampleidx2row_, - place_, - sample_stream_); - if (conf_.debug_mode) { - cudaMemcpy( - h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < buf_size_; xx++) { + conf, + &d_sampleidx2rows, + &cur_sampleidx2row, + place, + stream); + if (conf.debug_mode) { + cudaMemcpy(h_walk, + walk, + conf.buf_size * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < conf.buf_size; xx++) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } @@ -3066,80 +3224,526 @@ int GraphDataGenerator::FillWalkBufMultiPath() { // Update the global sampling state at this point if (update == true) { cur_metapath_start = tmp_len + start; - i += jump_rows_ * conf_.walk_len; - total_row_ += jump_rows_; + i += *jump_rows_ptr * conf.walk_len; + *total_row_ptr += *jump_rows_ptr; sample_times++; } else { VLOG(2) << "table is full, not update stat!"; break; } } - buf_state_.Reset(total_row_); - int *d_random_row = reinterpret_cast(d_random_row_->ptr()); - int *d_random_row_col_shift = - reinterpret_cast(d_random_row_col_shift_->ptr()); + buf_state->Reset(*total_row_ptr); - paddle::memory::ThrustAllocator allocator(place_, - sample_stream_); - thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = thrust::cuda::par(allocator).on(sample_stream_); + paddle::memory::ThrustAllocator allocator(place, stream); + thrust::random::default_random_engine engine(*shuffle_seed_ptr); + const auto &exec_policy = thrust::cuda::par(allocator).on(stream); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row_, + cnt_iter + *total_row_ptr, thrust::device_pointer_cast(d_random_row), engine); thrust::transform(exec_policy, cnt_iter, - cnt_iter + total_row_, + cnt_iter + *total_row_ptr, thrust::device_pointer_cast(d_random_row_col_shift), - RandInt(0, conf_.walk_len)); + RandInt(0, conf.walk_len)); - cudaStreamSynchronize(sample_stream_); - shuffle_seed_ = engine(); + cudaStreamSynchronize(stream); + *shuffle_seed_ptr = engine(); -
if (conf_.debug_mode) { - int *h_random_row = new int[total_row_ + 10]; + if (conf.debug_mode) { + int *h_random_row = new int[*total_row_ptr + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row_ * sizeof(int), + *total_row_ptr * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row_; xx++) { + for (int xx = 0; xx < *total_row_ptr; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; delete[] h_walk; - delete[] h_sample_keys; - delete[] h_offset2idx; - delete[] h_len_per_row; - delete[] h_prefix_sum; } - if (!conf_.sage_mode) { - uint64_t h_uniq_node_num = CopyUniqueNodes(table_, - copy_unique_len_, - place_, - d_uniq_node_num_, - host_vec_, - sample_stream_); - VLOG(1) << "sample_times:" << sample_times << ", d_walk_size:" << buf_size_ - << ", d_walk_offset:" << i << ", total_rows:" << total_row_ - << ", h_uniq_node_num:" << h_uniq_node_num - << ", total_samples:" << total_samples; + return *total_row_ptr != 0; +} + +void GraphDataGenerator::DoWalkandSage() { + int device_id = place_.GetDeviceId(); + debug_gpu_memory_info(device_id, "DoWalkandSage start"); + platform::CUDADeviceGuard guard(conf_.gpuid); + sage_batch_num_ = 0; + if (conf_.gpu_graph_training) { + bool train_flag = DoWalkForTrain(); + if (train_flag && conf_.sage_mode) { + DoSageForTrain(); + } } else { - VLOG(1) << "sample_times:" << sample_times << ", d_walk_size:" << buf_size_ - << ", d_walk_offset:" << i << ", total_rows:" << total_row_ - << ", total_samples:" << total_samples; + bool infer_flag = DoWalkForInfer(); + if (infer_flag && conf_.sage_mode) { + DoSageForInfer(); + } + } + if (conf_.gpu_graph_training || conf_.sage_mode) { + CopyUniqueNodes(table_, copy_unique_len_, place_, d_uniq_node_num_, + &host_vec_, sample_stream_); + } +} + +bool GraphDataGenerator::DoWalkForTrain() { + bool train_flag = true; + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + uint8_t *walk_ntype = NULL; + if (conf_.need_walk_ntype) { + walk_ntype = reinterpret_cast(d_walk_ntype_[tensor_pair_idx]->ptr()); + } + + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + if (FLAGS_graph_metapath_split_opt) { + train_flag &= FillWalkBufMultiPath( + h_device_keys_len_[tensor_pair_idx], + gpu_graph_ptr->meta_path_[tensor_pair_idx], + conf_, + &epoch_finish_, + ©_unique_len_, + place_, + gpu_graph_ptr->first_node_type_[tensor_pair_idx], + &(gpu_graph_ptr->node_type_start_[tensor_pair_idx][conf_.gpuid]), + reinterpret_cast(d_walk_[tensor_pair_idx]->ptr()), + walk_ntype, + &d_uniq_node_num_, + reinterpret_cast(d_random_row_[tensor_pair_idx]->ptr()), + reinterpret_cast(d_random_row_col_shift_[tensor_pair_idx]->ptr()), + &host_vec_, + &total_row_[tensor_pair_idx], + &jump_rows_[tensor_pair_idx], + &shuffle_seed_[tensor_pair_idx], + reinterpret_cast(d_train_metapath_keys_[tensor_pair_idx]->ptr()), + &h_train_metapath_keys_len_[tensor_pair_idx], + table_, + &buf_state_[tensor_pair_idx], + sample_stream_); + } else { + train_flag &= FillWalkBuf(h_device_keys_len_[tensor_pair_idx], + d_device_keys_[tensor_pair_idx], + gpu_graph_ptr->meta_path_[tensor_pair_idx], + conf_, + &epoch_finish_, + ©_unique_len_, + place_, + gpu_graph_ptr->first_node_type_[tensor_pair_idx], + &(gpu_graph_ptr->node_type_start_[tensor_pair_idx][conf_.gpuid]), + &(gpu_graph_ptr->finish_node_type_[tensor_pair_idx][conf_.gpuid]), + reinterpret_cast(d_walk_[tensor_pair_idx]->ptr()), + walk_ntype, + &d_uniq_node_num_, + 
reinterpret_cast(d_random_row_[tensor_pair_idx]->ptr()), + reinterpret_cast(d_random_row_col_shift_[tensor_pair_idx]->ptr()), + &multi_node_sync_stat_, + &host_vec_, + &total_row_[tensor_pair_idx], + &jump_rows_[tensor_pair_idx], + &shuffle_seed_[tensor_pair_idx], + table_, + &buf_state_[tensor_pair_idx], + sample_stream_); + } + } + + return train_flag; +} + +void GraphDataGenerator::DoSageForTrain() { + int total_instance = 0, uniq_instance = 0; + bool is_sage_pass_continue = true; + int sage_pass_end = 0; + uint64_t *ins_buf, *ins_cursor; + while (is_sage_pass_continue) { + for (int tensor_pair_idx = 0; + tensor_pair_idx < conf_.tensor_pair_num && is_sage_pass_continue; + ++tensor_pair_idx) { + while (ins_buf_pair_len_[tensor_pair_idx] < conf_.batch_size) { + int32_t *pair_label_buf = NULL; + if (conf_.enable_pair_label) { + pair_label_buf = + reinterpret_cast(d_pair_label_buf_[tensor_pair_idx]->ptr()); + } + int res = FillInsBuf(d_walk_[tensor_pair_idx], + d_walk_ntype_[tensor_pair_idx], + conf_, + d_random_row_[tensor_pair_idx], + d_random_row_col_shift_[tensor_pair_idx], + &buf_state_[tensor_pair_idx], + reinterpret_cast(d_ins_buf_[tensor_pair_idx]->ptr()), + pair_label_buf, + reinterpret_cast(d_pair_num_[tensor_pair_idx]->ptr()), + &ins_buf_pair_len_[tensor_pair_idx], + sample_stream_); + if (res == -1) { + if (ins_buf_pair_len_[tensor_pair_idx] == 0) { + if (conf_.is_multi_node) { + sage_pass_end = 1; + if (total_row_[tensor_pair_idx] != 0) { + buf_state_[tensor_pair_idx].Reset(total_row_[tensor_pair_idx]); + VLOG(1) << "reset buf state to make batch num equal in " + "multi node"; + } + } else { + is_sage_pass_continue = false; + break; + } + } else { + break; + } + } + } // end while (ins_buf_pair_len_[tensor_pair_idx] < conf_.batch_size) + + // check whether reach sage pass end + if (conf_.is_multi_node) { + int res = multi_node_sync_sample( + sage_pass_end, ncclProd, place_, &multi_node_sync_stat_); + if (res) { + is_sage_pass_continue = false; + break; + } + } + + total_instance = ins_buf_pair_len_[tensor_pair_idx] < conf_.batch_size ? 
+ ins_buf_pair_len_[tensor_pair_idx] : conf_.batch_size; + total_instance *= 2; + + ins_buf = reinterpret_cast(d_ins_buf_[tensor_pair_idx]->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_[tensor_pair_idx] * 2 - total_instance; + auto final_sage_nodes = GenerateSampleGraph(ins_cursor, + total_instance, + &uniq_instance, + conf_, + &inverse_vec_, + &graph_edges_vec_, + &edges_split_num_vec_, + &edge_type_graph_, + place_, + sample_stream_); + final_sage_nodes_vec_.emplace_back(final_sage_nodes); + uniq_instance_vec_.emplace_back(uniq_instance); + total_instance_vec_.emplace_back(total_instance); + + if (conf_.get_degree) { + auto node_degrees = GetNodeDegree(reinterpret_cast(final_sage_nodes->ptr()), + uniq_instance, + conf_, + place_, + sample_stream_); + node_degree_vec_.emplace_back(node_degrees); + } + + if (conf_.enable_pair_label) { + auto pair_label = memory::AllocShared( + place_, + total_instance / 2 * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); + int32_t *pair_label_buf = + reinterpret_cast(d_pair_label_buf_[tensor_pair_idx]->ptr()); + int32_t *pair_label_cursor = + pair_label_buf + ins_buf_pair_len_[tensor_pair_idx] - total_instance / 2; + cudaMemcpyAsync(pair_label->ptr(), + pair_label_cursor, + sizeof(int32_t) * total_instance / 2, + cudaMemcpyDeviceToDevice, + sample_stream_); + pair_label_vec_.emplace_back(pair_label); + } + + cudaStreamSynchronize(sample_stream_); + InsertTable(reinterpret_cast(final_sage_nodes->ptr()), + uniq_instance, + &d_uniq_node_num_, + conf_, + ©_unique_len_, + place_, + table_, + &host_vec_, + sample_stream_); + + ins_buf_pair_len_[tensor_pair_idx] -= total_instance / 2; + } // end for (int tensor_pair_idx = 0; + if (is_sage_pass_continue) { + sage_batch_num_ += 1; + } + } // end while (is_sage_pass_continue) +} + +void GraphDataGenerator::DoSageForInfer() { + // Set new batch size for multi_node + if (conf_.is_multi_node) { + int new_batch_size = dynamic_adjust_batch_num_for_sage(); + conf_.batch_size = new_batch_size; + } + + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int total_instance = 0, uniq_instance = 0; + total_instance = + (infer_node_start_[tensor_pair_idx] + conf_.batch_size <= infer_node_end_[tensor_pair_idx]) + ? 
conf_.batch_size + : infer_node_end_[tensor_pair_idx] - infer_node_start_[tensor_pair_idx]; + total_instance *= 2; + while (total_instance != 0) { + uint64_t *d_type_keys = reinterpret_cast( + d_device_keys_[tensor_pair_idx][infer_cursor_[tensor_pair_idx]]->ptr()); + d_type_keys += infer_node_start_[tensor_pair_idx]; + infer_node_start_[tensor_pair_idx] += total_instance / 2; + auto node_buf = memory::AllocShared( + place_, + total_instance * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + int64_t *node_buf_ptr = reinterpret_cast(node_buf->ptr()); + CopyDuplicateKeys<<>>( + node_buf_ptr, d_type_keys, total_instance / 2); + auto final_sage_nodes = GenerateSampleGraph(reinterpret_cast(node_buf->ptr()), + total_instance, + &uniq_instance, + conf_, + &inverse_vec_, + &graph_edges_vec_, + &edges_split_num_vec_, + &edge_type_graph_, + place_, + sample_stream_); + final_sage_nodes_vec_.emplace_back(final_sage_nodes); + uniq_instance_vec_.emplace_back(uniq_instance); + total_instance_vec_.emplace_back(total_instance); + + if (conf_.get_degree) { + auto node_degrees = GetNodeDegree(reinterpret_cast(final_sage_nodes->ptr()), + uniq_instance, + conf_, + place_, + sample_stream_); + node_degree_vec_.emplace_back(node_degrees); + } + cudaStreamSynchronize(sample_stream_); + InsertTable(reinterpret_cast(final_sage_nodes->ptr()), + uniq_instance, + &d_uniq_node_num_, + conf_, + ©_unique_len_, + place_, + table_, + &host_vec_, + sample_stream_); + + total_instance = + (infer_node_start_[tensor_pair_idx] + conf_.batch_size <= infer_node_end_[tensor_pair_idx]) + ? conf_.batch_size + : infer_node_end_[tensor_pair_idx] - infer_node_start_[tensor_pair_idx]; + total_instance *= 2; + sage_batch_num_ += 1; + } // end while (total_instance != 0) + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + + sage_batch_num_ /= conf_.tensor_pair_num; +} + +void GraphDataGenerator::clear_gpu_mem() { + platform::CUDADeviceGuard guard(conf_.gpuid); + delete table_; +} + +int dynamic_adjust_total_row_for_infer(int local_reach_end, + const paddle::platform::Place &place, + cudaStream_t stream) { + auto send_buff = memory::Alloc( + place, + 2 * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int *send_buff_ptr = reinterpret_cast(send_buff->ptr()); + cudaMemcpyAsync(send_buff_ptr, + &local_reach_end, + sizeof(int), + cudaMemcpyHostToDevice, + stream); + cudaStreamSynchronize(stream); + auto comm = + platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], + &send_buff_ptr[1], + 1, + ncclInt, + ncclProd, + comm->comm(), + stream)); + int global_reach_end = 0; + cudaMemcpyAsync(&global_reach_end, + &send_buff_ptr[1], + sizeof(int), + cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + return global_reach_end; +} + +bool FillInferBuf(const std::vector &h_device_keys_len, // input + const std::vector> &d_device_keys, + const GraphDataGeneratorConfig &conf, + int tensor_pair_idx, + int *total_row_ptr, + size_t *infer_node_start_ptr, + size_t *infer_node_end_ptr, + size_t *infer_cursor_ptr, + std::vector *host_vec_ptr, // output + const paddle::platform::Place &place, + cudaStream_t stream) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &global_infer_node_type_start = + gpu_graph_ptr->global_infer_node_type_start_[conf.gpuid]; + auto &infer_cursor = gpu_graph_ptr->infer_cursor_[tensor_pair_idx][conf.thread_id]; + *total_row_ptr = 0; + if (infer_cursor 
< h_device_keys_len.size()) { + while (global_infer_node_type_start[infer_cursor] >= + h_device_keys_len[infer_cursor]) { + infer_cursor++; + if (infer_cursor >= h_device_keys_len.size()) { + return false; + } + } + if (!conf.infer_node_type_index_set.empty()) { + while (infer_cursor < h_device_keys_len.size()) { + if (conf.infer_node_type_index_set.find(infer_cursor) == + conf.infer_node_type_index_set.end()) { + VLOG(2) << "Skip cursor[" << infer_cursor << "]"; + infer_cursor++; + continue; + } else { + VLOG(2) << "Not skip cursor[" << infer_cursor << "]"; + break; + } + } + if (infer_cursor >= h_device_keys_len.size()) { + return false; + } + } + + size_t device_key_size = h_device_keys_len[infer_cursor]; + if (conf.is_multi_node) { + int local_reach_end = global_infer_node_type_start[infer_cursor] + conf.buf_size >= + device_key_size; + int global_reach_end = dynamic_adjust_total_row_for_infer(local_reach_end, place, stream); + int remain = device_key_size - global_infer_node_type_start[infer_cursor]; + if (global_reach_end) { + *total_row_ptr = remain; + } else { + if (local_reach_end) { + *total_row_ptr = remain / 2; + } else { + *total_row_ptr = conf.buf_size; + } + } + } else { + *total_row_ptr = + (global_infer_node_type_start[infer_cursor] + conf.buf_size <= + device_key_size) + ? conf.buf_size + : device_key_size - global_infer_node_type_start[infer_cursor]; + } + + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys[infer_cursor]->ptr()); + if (!conf.sage_mode) { + host_vec_ptr->resize(*total_row_ptr); + cudaMemcpyAsync(host_vec_ptr->data(), + d_type_keys + global_infer_node_type_start[infer_cursor], + sizeof(uint64_t) * *total_row_ptr, + cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + } + VLOG(1) << "gpuid: " << conf.gpuid + << " cursor: " << infer_cursor + << " start: " << global_infer_node_type_start[infer_cursor] + << " num: " << *total_row_ptr + << " device_key_size: " << device_key_size; + *infer_node_start_ptr = global_infer_node_type_start[infer_cursor]; + global_infer_node_type_start[infer_cursor] += *total_row_ptr; + *infer_node_end_ptr = global_infer_node_type_start[infer_cursor]; + *infer_cursor_ptr = infer_cursor; + return true; } - return total_row_ != 0; + return false; +} + +bool GraphDataGenerator::DoWalkForInfer() { + platform::CUDADeviceGuard guard(conf_.gpuid); + bool infer_flag = true; + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + infer_flag &= FillInferBuf(h_device_keys_len_[tensor_pair_idx], + d_device_keys_[tensor_pair_idx], + conf_, + tensor_pair_idx, + &total_row_[tensor_pair_idx], + &infer_node_start_[tensor_pair_idx], + &infer_node_end_[tensor_pair_idx], + &infer_cursor_[tensor_pair_idx], + &host_vec_, + place_, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + } + + return infer_flag; +} + +void GraphDataGenerator::ClearSampleState() { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[tensor_pair_idx][conf_.gpuid]; + auto &node_type_start = gpu_graph_ptr->node_type_start_[tensor_pair_idx][conf_.gpuid]; + finish_node_type.clear(); + for (auto iter = node_type_start.begin(); iter != node_type_start.end(); + iter++) { + iter->second = 0; + } + } } void GraphDataGenerator::SetFeedVec(std::vector feed_vec) { feed_vec_ = feed_vec; } +void GraphDataGenerator::SetFeedInfo(std::vector* feed_info) { + feed_info_ 
= feed_info; + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + int feed_vec_idx = 2 + tensor_pair_idx * conf_.tensor_num_of_one_pair; + ++feed_vec_idx; + if (conf_.enable_pair_label) { + ++feed_vec_idx; + } + for (int i = 0; i < conf_.slot_num; i++) { + int offset = feed_vec_idx + 2 * i; + if ((*feed_info_)[offset].type[0] == 'f') { // float feature + // if (first_float_idx_ == -1) { + // first_float_idx_ = i; + // } + float_slot_num_++; + } else if ((*feed_info_)[offset].type[0] == 'u') { // slot feature + // if (first_slot_idx_ == -1) { + // first_slot_idx_ = i; + // } + uint_slot_num_++; + } + } + } +} + void GraphDataGenerator::AllocResource( int thread_id, std::vector feed_vec) { auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); @@ -3168,11 +3772,11 @@ void GraphDataGenerator::AllocResource( << " feed_vec.size: " << feed_vec.size() << " table cap: " << conf_.train_table_cap; - is_multi_node_ = false; + conf_.is_multi_node = false; #if defined(PADDLE_WITH_GLOO) auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Size() > 1) { - is_multi_node_ = true; + conf_.is_multi_node = true; } #endif @@ -3192,49 +3796,54 @@ void GraphDataGenerator::AllocResource( // stream_)); // } if (conf_.gpu_graph_training && FLAGS_graph_metapath_split_opt) { - d_train_metapath_keys_ = - gpu_graph_ptr->d_node_iter_graph_metapath_keys_[thread_id]; - h_train_metapath_keys_len_ = - gpu_graph_ptr->h_node_iter_graph_metapath_keys_len_[thread_id]; - VLOG(2) << "h train metapaths key len: " << h_train_metapath_keys_len_; + d_train_metapath_keys_.resize(conf_.tensor_pair_num); + h_train_metapath_keys_len_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + d_train_metapath_keys_[tensor_pair_idx] = + gpu_graph_ptr->d_node_iter_graph_metapath_keys_[thread_id]; + h_train_metapath_keys_len_[tensor_pair_idx] = + gpu_graph_ptr->h_node_iter_graph_metapath_keys_len_[thread_id]; + VLOG(2) << "h train metapaths key len: " << h_train_metapath_keys_len_[tensor_pair_idx]; + } } else { - auto &d_graph_all_type_keys = - gpu_graph_ptr->d_node_iter_graph_all_type_keys_; - auto &h_graph_all_type_keys_len = - gpu_graph_ptr->h_node_iter_graph_all_type_keys_len_; - - for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { - d_device_keys_.push_back(d_graph_all_type_keys[i][thread_id]); - h_device_keys_len_.push_back(h_graph_all_type_keys_len[i][thread_id]); + d_device_keys_.resize(conf_.tensor_pair_num); + h_device_keys_len_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + auto &d_graph_all_type_keys = + gpu_graph_ptr->d_node_iter_graph_all_type_keys_; + auto &h_graph_all_type_keys_len = + gpu_graph_ptr->h_node_iter_graph_all_type_keys_len_; + + for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { + d_device_keys_[tensor_pair_idx].push_back(d_graph_all_type_keys[i][thread_id]); + h_device_keys_len_[tensor_pair_idx].push_back(h_graph_all_type_keys_len[i][thread_id]); + } + VLOG(2) << "h_device_keys size: " << h_device_keys_len_[tensor_pair_idx].size(); } - VLOG(2) << "h_device_keys size: " << h_device_keys_len_.size(); } - size_t once_max_sample_keynum = - conf_.walk_degree * conf_.once_sample_startid_len; - d_prefix_sum_ = memory::AllocShared( - place_, - (once_max_sample_keynum + 1) * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - int *d_prefix_sum_ptr = 
reinterpret_cast(d_prefix_sum_->ptr()); - cudaMemsetAsync(d_prefix_sum_ptr, - 0, - (once_max_sample_keynum + 1) * sizeof(int), - sample_stream_); - infer_cursor_ = 0; - jump_rows_ = 0; - d_uniq_node_num_ = memory::AllocShared( - place_, - sizeof(uint64_t), - phi::Stream(reinterpret_cast(sample_stream_))); + infer_cursor_.assign(conf_.tensor_pair_num, 0); + jump_rows_.assign(conf_.tensor_pair_num, 0); + d_uniq_node_num_ = memory::AllocShared(place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); cudaMemsetAsync(d_uniq_node_num_->ptr(), 0, sizeof(uint64_t), sample_stream_); - d_walk_ = memory::AllocShared( - place_, - buf_size_ * sizeof(uint64_t), - phi::Stream(reinterpret_cast(sample_stream_))); - cudaMemsetAsync( - d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), sample_stream_); + total_row_.assign(conf_.tensor_pair_num, 0); + infer_node_start_.assign(conf_.tensor_pair_num, 0); + infer_node_end_.assign(conf_.tensor_pair_num, 0); + d_walk_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + d_walk_[tensor_pair_idx] = memory::AllocShared( + place_, + conf_.buf_size * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_walk_[tensor_pair_idx]->ptr(), 0, conf_.buf_size * sizeof(uint64_t), sample_stream_); + } conf_.excluded_train_pair_len = gpu_graph_ptr->excluded_train_pair_.size(); if (conf_.excluded_train_pair_len > 0) { @@ -3249,59 +3858,58 @@ void GraphDataGenerator::AllocResource( sample_stream_)); } - d_sample_keys_ = memory::AllocShared( - place_, - once_max_sample_keynum * sizeof(uint64_t), - phi::Stream(reinterpret_cast(sample_stream_))); - - d_sampleidx2rows_.push_back(memory::AllocShared( - place_, - once_max_sample_keynum * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_)))); - d_sampleidx2rows_.push_back(memory::AllocShared( - place_, - once_max_sample_keynum * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_)))); - cur_sampleidx2row_ = 0; - for (int i = -conf_.window; i < 0; i++) { conf_.window_step.push_back(i); } for (int i = 0; i < conf_.window; i++) { conf_.window_step.push_back(i + 1); } - buf_state_.Init(conf_.batch_size, conf_.walk_len, &conf_.window_step); - d_random_row_ = memory::AllocShared( - place_, - (conf_.once_sample_startid_len * conf_.walk_degree * repeat_time_) * - sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - - d_random_row_col_shift_ = memory::AllocShared( - place_, - (conf_.once_sample_startid_len * conf_.walk_degree * repeat_time_) * - sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - - shuffle_seed_ = 0; + buf_state_.resize(conf_.tensor_pair_num); + d_random_row_.resize(conf_.tensor_pair_num); + d_random_row_col_shift_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + buf_state_[tensor_pair_idx].Init(conf_.batch_size, conf_.walk_len, &conf_.window_step); + d_random_row_[tensor_pair_idx] = memory::AllocShared( + place_, + (conf_.once_sample_startid_len * conf_.walk_degree * repeat_time_) * + sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); + + d_random_row_col_shift_[tensor_pair_idx] = memory::AllocShared( + place_, + (conf_.once_sample_startid_len * conf_.walk_degree * repeat_time_) * + sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); + } - ins_buf_pair_len_ = 0; - d_ins_buf_ = memory::AllocShared( - place_, - (conf_.batch_size * 2 * 2) * sizeof(uint64_t), - 
phi::Stream(reinterpret_cast(sample_stream_))); - d_pair_num_ = memory::AllocShared( - place_, - sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); + shuffle_seed_.assign(conf_.tensor_pair_num, 0); + ins_buf_pair_len_.assign(conf_.tensor_pair_num, 0); + d_ins_buf_.resize(conf_.tensor_pair_num); + d_pair_num_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + d_ins_buf_[tensor_pair_idx] = memory::AllocShared( + place_, + (conf_.batch_size * 2 * 2) * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + d_pair_num_[tensor_pair_idx] = memory::AllocShared( + place_, + sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); + } conf_.enable_pair_label = conf_.gpu_graph_training && gpu_graph_ptr->pair_label_conf_.size() > 0; if (conf_.enable_pair_label) { conf_.node_type_num = gpu_graph_ptr->id_to_feature.size(); - d_pair_label_buf_ = memory::AllocShared( - place_, - (conf_.batch_size * 2) * sizeof(int32_t), - phi::Stream(reinterpret_cast(sample_stream_))); + d_pair_label_buf_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + d_pair_label_buf_[tensor_pair_idx] = memory::AllocShared( + place_, + (conf_.batch_size * 2) * sizeof(int32_t), + phi::Stream(reinterpret_cast(sample_stream_))); + } conf_.d_pair_label_conf = memory::AllocShared( place_, conf_.node_type_num * conf_.node_type_num * sizeof(int32_t), @@ -3320,12 +3928,18 @@ void GraphDataGenerator::AllocResource( conf_.need_walk_ntype = conf_.excluded_train_pair_len > 0 || conf_.enable_pair_label; if (conf_.need_walk_ntype) { - d_walk_ntype_ = memory::AllocShared( - place_, - buf_size_ * sizeof(uint8_t), - phi::Stream(reinterpret_cast(sample_stream_))); - cudaMemsetAsync( - d_walk_ntype_->ptr(), 0, buf_size_ * sizeof(uint8_t), sample_stream_); + d_walk_ntype_.resize(conf_.tensor_pair_num); + for (int tensor_pair_idx = 0; tensor_pair_idx < conf_.tensor_pair_num; + ++tensor_pair_idx) { + d_walk_ntype_[tensor_pair_idx] = memory::AllocShared( + place_, + conf_.buf_size * sizeof(uint8_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync(d_walk_ntype_[tensor_pair_idx]->ptr(), + 0, + conf_.buf_size * sizeof(uint8_t), + sample_stream_); + } } if (!conf_.sage_mode) { @@ -3335,8 +3949,23 @@ void GraphDataGenerator::AllocResource( : id_offset_of_feed_vec_ + 1; int sample_offset = conf_.return_weight ? 
6 : 5; conf_.slot_num = - (feed_vec.size() - offset - samples_.size() * sample_offset) / 2; + (feed_vec.size() - offset - conf_.samples.size() * sample_offset) / 2; + } + VLOG(1) << "slot_num[" << conf_.slot_num << "]"; + conf_.tensor_num_of_one_pair = 1 + conf_.slot_num * 2; // id and slot + if (conf_.sage_mode) { + conf_.tensor_num_of_one_pair += 5 * conf_.samples.size() + 1; // sage[] and inverse_index + } + if (conf_.enable_pair_label) { + conf_.tensor_num_of_one_pair++; + } + if (conf_.get_degree) { + conf_.tensor_num_of_one_pair++; + } + if (conf_.return_weight) { + conf_.tensor_num_of_one_pair += conf_.samples.size(); } + VLOG(1) << "tensor_num_of_one_pair[" << conf_.tensor_num_of_one_pair << "]"; d_slot_tensor_ptr_ = memory::AllocShared( place_, @@ -3350,15 +3979,15 @@ void GraphDataGenerator::AllocResource( if (conf_.sage_mode) { conf_.reindex_table_size = conf_.batch_size * 2; // get hashtable size - for (int i = 0; i < samples_.size(); i++) { - conf_.reindex_table_size *= (samples_[i] * edge_to_id_len_ + 1); + for (int i = 0; i < conf_.samples.size(); i++) { + conf_.reindex_table_size *= (conf_.samples[i] * conf_.edge_to_id_len + 1); } int64_t next_pow2 = 1 << static_cast(1 + std::log2(conf_.reindex_table_size >> 1)); conf_.reindex_table_size = next_pow2 << 1; edge_type_graph_ = - gpu_graph_ptr->get_edge_type_graph(conf_.gpuid, edge_to_id_len_); + gpu_graph_ptr->get_edge_type_graph(conf_.gpuid, conf_.edge_to_id_len); } // parse infer_node_type @@ -3378,10 +4007,10 @@ void GraphDataGenerator::AllocResource( VLOG(2) << "add node[" << type << "] into infer_node_type, type_index(cursor)[" << type_index << "]"; - infer_node_type_index_set_.insert(type_index); + conf_.infer_node_type_index_set.insert(type_index); } VLOG(2) << "infer_node_type_index_set_num: " - << infer_node_type_index_set_.size(); + << conf_.infer_node_type_index_set.size(); } int *stat_ptr = @@ -3420,8 +4049,8 @@ void GraphDataGenerator::SetConfig( conf_.batch_size = conf_.once_sample_startid_len; } repeat_time_ = graph_config.sample_times_one_chunk(); - buf_size_ = conf_.once_sample_startid_len * conf_.walk_len * - conf_.walk_degree * repeat_time_; + conf_.buf_size = conf_.once_sample_startid_len * conf_.walk_len * + conf_.walk_degree * repeat_time_; conf_.train_table_cap = graph_config.train_table_cap(); conf_.infer_table_cap = graph_config.infer_table_cap(); conf_.get_degree = graph_config.get_degree(); @@ -3446,21 +4075,25 @@ void GraphDataGenerator::SetConfig( meta_path, graph_config.excluded_train_pair(), graph_config.pair_label()); + conf_.tensor_pair_num = gpu_graph_ptr->tensor_pair_num_; debug_gpu_memory_info("init_conf end"); auto edge_to_id = gpu_graph_ptr->edge_to_id; - edge_to_id_len_ = edge_to_id.size(); + conf_.edge_to_id_len = edge_to_id.size(); sage_batch_count_ = 0; auto samples = paddle::string::split_string(str_samples, ";"); for (size_t i = 0; i < samples.size(); i++) { int sample_size = std::stoi(samples[i]); - samples_.emplace_back(sample_size); + conf_.samples.emplace_back(sample_size); } copy_unique_len_ = 0; if (!conf_.gpu_graph_training) { infer_node_type_ = graph_config.infer_node_type(); } + + conf_.once_max_sample_keynum = + conf_.walk_degree * conf_.once_sample_startid_len; } #endif @@ -3479,14 +4112,14 @@ void GraphDataGenerator::DumpWalkPath(std::string dump_path, size_t dump_rate) { "the dump rate[1, 10000000]")); int err_no = 0; std::shared_ptr fp = fs_open_append_write(dump_path, &err_no, ""); - uint64_t *h_walk = new uint64_t[buf_size_]; - uint64_t *walk = 
reinterpret_cast(d_walk_->ptr()); + uint64_t *h_walk = new uint64_t[conf_.buf_size]; + uint64_t *walk = reinterpret_cast(d_walk_[0]->ptr()); cudaMemcpy( - h_walk, walk, buf_size_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); - VLOG(1) << "DumpWalkPath all buf_size_:" << buf_size_; + h_walk, walk, conf_.buf_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); + VLOG(1) << "DumpWalkPath all conf_.buf_size:" << conf_.buf_size; std::string ss = ""; size_t write_count = 0; - for (int xx = 0; xx < buf_size_ / dump_rate; xx += conf_.walk_len) { + for (int xx = 0; xx < conf_.buf_size / dump_rate; xx += conf_.walk_len) { ss = ""; for (int yy = 0; yy < conf_.walk_len; yy++) { ss += std::to_string(h_walk[xx + yy]) + "-"; @@ -3500,37 +4133,13 @@ void GraphDataGenerator::DumpWalkPath(std::string dump_path, size_t dump_rate) { #endif } -int GraphDataGenerator::multi_node_sync_sample(int flag, - const ncclRedOp_t &op) { - if (flag < 0 && flag > 2) { - VLOG(0) << "invalid flag! " << flag; - assert(false); - return -1; - } - - int ret = 0; -#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_GPU_GRAPH) - int *stat_ptr = multi_node_sync_stat_.data(); - auto comm = - platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); - auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - &stat_ptr[flag], &stat_ptr[3], 1, ncclInt, op, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output - &stat_ptr[3], - sizeof(int), - cudaMemcpyDeviceToHost, - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); -#endif - return ret; -} - int GraphDataGenerator::dynamic_adjust_batch_num_for_sage() { - int batch_num = (total_row_ + conf_.batch_size - 1) / conf_.batch_size; - auto send_buff = memory::Alloc(place_, 2 * sizeof(int), - phi::Stream(reinterpret_cast(sample_stream_))); - int* send_buff_ptr = reinterpret_cast(send_buff->ptr()); + int batch_num = (total_row_[0] + conf_.batch_size - 1) / conf_.batch_size; + auto send_buff = memory::Alloc( + place_, + 2 * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); + int *send_buff_ptr = reinterpret_cast(send_buff->ptr()); cudaMemcpyAsync(send_buff_ptr, &batch_num, sizeof(int), @@ -3554,10 +4163,11 @@ int GraphDataGenerator::dynamic_adjust_batch_num_for_sage() { sample_stream_); cudaStreamSynchronize(sample_stream_); - int new_batch_size = (total_row_ + thread_max_batch_num - 1) / thread_max_batch_num; + int new_batch_size = + (total_row_[0] + thread_max_batch_num - 1) / thread_max_batch_num; VLOG(2) << conf_.gpuid << " dynamic adjust sage batch num " - << " max_batch_num: " << thread_max_batch_num - << " new_batch_size: " << new_batch_size; + << " max_batch_num: " << thread_max_batch_num + << " new_batch_size: " << new_batch_size; return new_batch_size; } diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 8842037fa45a54..eaff61d12df7aa 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -908,6 +908,7 @@ struct GraphDataGeneratorConfig { bool get_degree; bool weighted_sample; bool return_weight; + bool is_multi_node; int batch_size; int slot_num; int walk_degree; @@ -919,12 +920,19 @@ struct GraphDataGeneratorConfig { int node_type_num; int debug_mode; int excluded_train_pair_len; + int edge_to_id_len; + int tensor_pair_num; + uint32_t tensor_num_of_one_pair; + size_t buf_size; + size_t once_max_sample_keynum; int64_t reindex_table_size; uint64_t train_table_cap; uint64_t infer_table_cap; std::vector 
window_step; + std::vector samples; std::shared_ptr d_excluded_train_pair; std::shared_ptr d_pair_label_conf; + std::set infer_node_type_index_set; }; class GraphDataGenerator { @@ -935,10 +943,8 @@ class GraphDataGenerator { void AllocResource(int thread_id, std::vector feed_vec); void AllocTrainResource(int thread_id); void SetFeedVec(std::vector feed_vec); + void SetFeedInfo(std::vector* feed_info); int GenerateBatch(); - int FillWalkBuf(); - int FillWalkBufMultiPath(); - int FillInferBuf(); void DoWalkandSage(); int FillSlotFeature(uint64_t* d_walk); int FillIdShowClkTensor(int total_instance, bool gpu_graph_training); @@ -949,20 +955,19 @@ class GraphDataGenerator { int total_instance, bool gpu_graph_training, std::shared_ptr final_sage_nodes = nullptr); - int FillSlotFeature(uint64_t* d_walk, size_t key_num); - int GetPathNum() { return total_row_; } - void ResetPathNum() { total_row_ = 0; } - int GetGraphBatchsize() { return conf_.batch_size; }; + int FillSlotFeature(uint64_t* d_walk, size_t key_num, int tensor_pair_idx); + int FillFloatFeature(uint64_t* d_walk, size_t key_num, int tensor_pair_idx); + int GetPathNum() { return total_row_[0]; } + void ResetPathNum() { total_row_[0] = 0; } + int GetGraphBatchsize() { return conf_.batch_size; } void SetNewBatchsize(int batch_num) { if (!conf_.gpu_graph_training) { - conf_.batch_size = (total_row_ + batch_num - 1) / batch_num; + conf_.batch_size = (total_row_[0] + batch_num - 1) / batch_num; } else { return; } } - bool GetSageMode() { - return conf_.sage_mode; - } + bool GetSageMode() { return conf_.sage_mode; } void ResetEpochFinish() { epoch_finish_ = false; } void reset_pass_end() { pass_end_ = 0; } void ClearSampleState(); @@ -972,31 +977,22 @@ class GraphDataGenerator { // h_device_keys_.push_back(device_keys); } - std::vector> SampleNeighbors( - int64_t* uniq_nodes, - int len, - int sample_size, - std::vector& edges_split_num, // NOLINT - int64_t* neighbor_len); - std::shared_ptr GenerateSampleGraph( - uint64_t* node_ids, - int len, - int* uniq_len, - std::shared_ptr& inverse); // NOLINT - std::shared_ptr GetNodeDegree(uint64_t* node_ids, int len); std::vector& GetHostVec() { return host_vec_; } bool get_epoch_finish() { return epoch_finish_; } int get_pass_end() { return pass_end_; } void clear_gpu_mem(); - int multi_node_sync_sample(int flag, const ncclRedOp_t& op); int dynamic_adjust_batch_num_for_sage(); protected: + bool DoWalkForInfer(); + void DoSageForInfer(); + bool DoWalkForTrain(); + void DoSageForTrain(); + HashTable* table_; GraphDataGeneratorConfig conf_; - size_t infer_cursor_; - size_t jump_rows_; - int edge_to_id_len_; + std::vector infer_cursor_; + std::vector jump_rows_; int64_t* id_tensor_ptr_; int* index_tensor_ptr_; int64_t* show_tensor_ptr_; @@ -1008,33 +1004,27 @@ class GraphDataGenerator { cudaStream_t sample_stream_; paddle::platform::Place place_; std::vector feed_vec_; + std::vector* feed_info_; // adapt for float feature std::vector offset_; - std::shared_ptr d_prefix_sum_; - std::vector> d_device_keys_; - std::shared_ptr d_train_metapath_keys_; + std::vector>> d_device_keys_; + std::vector> d_train_metapath_keys_; - std::shared_ptr d_walk_; - std::shared_ptr d_walk_ntype_; + std::vector> d_walk_; + std::vector> d_walk_ntype_; std::shared_ptr d_feature_list_; std::shared_ptr d_feature_; - std::shared_ptr d_random_row_; - std::shared_ptr d_random_row_col_shift_; + std::vector> d_random_row_; + std::vector> d_random_row_col_shift_; std::shared_ptr d_uniq_node_num_; std::shared_ptr 
d_slot_feature_num_map_; std::shared_ptr d_actual_slot_id_map_; std::shared_ptr d_fea_offset_map_; - std::vector> d_sampleidx2rows_; - int cur_sampleidx2row_; - // record the keys to call graph_neighbor_sample - std::shared_ptr d_sample_keys_; - int sample_keys_len_; - - std::shared_ptr d_pair_label_buf_; - std::shared_ptr d_ins_buf_; + std::vector> d_pair_label_buf_; + std::vector> d_ins_buf_; std::shared_ptr d_feature_size_list_buf_; std::shared_ptr d_feature_size_prefixsum_buf_; - std::shared_ptr d_pair_num_; + std::vector> d_pair_num_; std::shared_ptr d_slot_tensor_ptr_; std::shared_ptr d_slot_lod_tensor_ptr_; std::vector> edge_type_graph_; @@ -1051,29 +1041,27 @@ class GraphDataGenerator { int sage_batch_count_; int sage_batch_num_; - int ins_buf_pair_len_; + std::vector ins_buf_pair_len_; int id_offset_of_feed_vec_; // size of a d_walk buf - size_t buf_size_; int repeat_time_; - BufState buf_state_; + std::vector buf_state_; + int float_slot_num_ = 0; // float slot num + int uint_slot_num_ = 0; // uint slot num std::vector h_slot_feature_num_map_; int fea_num_per_node_; - int shuffle_seed_; - std::vector samples_; + std::vector shuffle_seed_; bool epoch_finish_; int pass_end_ = 0; std::vector host_vec_; - std::vector h_device_keys_len_; - uint64_t h_train_metapath_keys_len_; + std::vector> h_device_keys_len_; + std::vector h_train_metapath_keys_len_; uint64_t copy_unique_len_; - int total_row_; - size_t infer_node_start_; - size_t infer_node_end_; - std::set infer_node_type_index_set_; + std::vector total_row_; + std::vector infer_node_start_; + std::vector infer_node_end_; std::string infer_node_type_; - bool is_multi_node_; phi::DenseTensor multi_node_sync_stat_; }; @@ -1277,7 +1265,6 @@ class DataFeed { // The data read by DataFeed will be stored here std::vector feed_vec_; - phi::DenseTensor* rank_offset_; // the batch size defined by user diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index a3745de929a911..c05f996e8b218d 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -310,6 +310,7 @@ class HogwildWorker : public CPUWorkerBase { std::vector need_copy_vars_; std::vector shard_dump_params_; std::vector shard_dump_fields_; + std::multiset free_param_vars_; bool sharding_mode_ = false; }; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 4a94725983470d..3885184fd1351b 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -48,6 +48,7 @@ message ShardingConfig { optional bool _dp_as_optimizer_sharding = 13 [ default = false ]; optional int32 stage = 14 [ default = 1 ]; optional bool enable_tuning = 15 [ default = false ]; // incubate for auto parallel + optional bool use_calc_stream = 16 [default = false]; } message HybridConfig { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 2bb616448c0bb5..5c9da941b0d5d8 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -94,6 +94,7 @@ class HeterContext { int multi_mf_dim_ = 0; void* sub_graph_feas = NULL; + void* sub_graph_float_feas = NULL; uint32_t shard_num_ = 37; uint16_t pass_id_ = 0; uint64_t size() { diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 23b4a70cc334f9..c130937a9b11d2 100644 --- 
a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -533,6 +533,84 @@ struct GpuPsCommGraphFea { } }; // end of struct GpuPsCommGraphFea +struct GpuPsCommGraphFloatFea { + uint64_t *node_list; // only locate on host side, the list of node id + float* feature_list; // locate on both side + uint8_t *slot_id_list; // locate on both side + GpuPsFeaInfo + *fea_info_list; // only locate on host side, the list of fea_info + uint64_t feature_size, node_size, feature_capacity; + + // the size of feature array and graph_node_list array + GpuPsCommGraphFloatFea() + : node_list(NULL), + feature_list(NULL), + slot_id_list(NULL), + fea_info_list(NULL), + feature_size(0), + node_size(0) {} + GpuPsCommGraphFloatFea(uint64_t *node_list_, + float *feature_list_, + uint8_t *slot_id_list_, + GpuPsFeaInfo *fea_info_list_, + uint64_t feature_size_, + uint64_t node_size_) + : node_list(node_list_), + feature_list(feature_list_), + slot_id_list(slot_id_list_), + fea_info_list(fea_info_list_), + feature_size(feature_size_), + node_size(node_size_) {} + void init_on_cpu(uint64_t feature_size, + uint64_t node_size, + uint32_t slot_num) { + PADDLE_ENFORCE_LE( + slot_num, + 255, + platform::errors::InvalidArgument( + "The number of slot_num should not be greater than 255 " + ", but the slot_num is %d ", + slot_num)); + this->feature_size = feature_size; + this->node_size = node_size; + this->node_list = new uint64_t[node_size]; + this->feature_list = new float[feature_size]; + this->slot_id_list = new uint8_t[feature_size]; + this->fea_info_list = new GpuPsFeaInfo[node_size]; + } + void release_on_cpu() { +#define DEL_PTR_ARRAY(p) \ + if (p != nullptr) { \ + delete[] p; \ + p = nullptr; \ + } + DEL_PTR_ARRAY(node_list); + DEL_PTR_ARRAY(feature_list); + DEL_PTR_ARRAY(slot_id_list); + DEL_PTR_ARRAY(fea_info_list); + } + void display_on_cpu() const { + VLOG(1) << "feature_size = " << feature_size; + VLOG(1) << "node_size = " << node_size; + for (uint64_t i = 0; i < feature_size; i++) { + VLOG(1) << "feature_list[" << i << "] = " << feature_list[i]; + } + for (uint64_t i = 0; i < node_size; i++) { + VLOG(1) << "node_id[" << node_list[i] + << "] feature_size = " << fea_info_list[i].feature_size; + std::string str; + uint32_t offset = fea_info_list[i].feature_offset; + for (uint64_t j = 0; j < fea_info_list[i].feature_size; j++) { + if (j > 0) str += ","; + str += std::to_string(slot_id_list[j + offset]); + str += ":"; + str += std::to_string(feature_list[j + offset]); + } + VLOG(1) << str; + } + } +}; // end of struct GpuPsCommGraphFloatFea + } // end of namespace framework } // end of namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index e22c5a0cec0bd6..71e2e72bf23ace 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -37,7 +37,7 @@ class GpuPsGraphTable public: inline int get_table_offset(int gpu_id, GraphTableType type, int idx) const { int type_id = type; - return gpu_id * (graph_table_num_ + feature_table_num_) + + return gpu_id * (graph_table_num_ + feature_table_num_ + float_feature_table_num_) + type_id * graph_table_num_ + idx; } inline int get_graph_list_offset(int gpu_id, int edge_idx) const { @@ -46,9 +46,12 @@ class GpuPsGraphTable inline int get_graph_fea_list_offset(int gpu_id) const { return gpu_id * feature_table_num_; } + inline int 
get_graph_float_fea_list_offset(int gpu_id) const { + return gpu_id * float_feature_table_num_; + } GpuPsGraphTable(std::shared_ptr resource, - int graph_table_num) + int graph_table_num, int slot_num_for_pull_feature = 0, int float_slot_num = 0) : HeterComm( 0, resource) { load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; @@ -58,11 +61,15 @@ class GpuPsGraphTable rw_lock.reset(new pthread_rwlock_t()); this->graph_table_num_ = graph_table_num; this->feature_table_num_ = 1; + if (float_slot_num > 0) { + VLOG(0) << "float_feature_table_num set to 1"; + this->float_feature_table_num_ = 1; + } gpu_num = resource_->total_device(); memset(global_device_map, -1, sizeof(global_device_map)); tables_ = std::vector( - gpu_num * (graph_table_num_ + feature_table_num_), NULL); + gpu_num * (graph_table_num_ + feature_table_num_ + float_feature_table_num_), NULL); for (int i = 0; i < gpu_num; i++) { global_device_map[resource_->dev_id(i)] = i; for (int j = 0; j < graph_table_num_; j++) { @@ -71,6 +78,9 @@ class GpuPsGraphTable for (int j = 0; j < feature_table_num_; j++) { gpu_graph_fea_list_.push_back(GpuPsCommGraphFea()); } + for (int j = 0; j < float_feature_table_num_; j++) { + gpu_graph_float_fea_list_.push_back(GpuPsCommGraphFloatFea()); + } } cpu_table_status = -1; device_mutex_.resize(gpu_num); @@ -86,9 +96,11 @@ class GpuPsGraphTable } void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); + void build_graph_float_fea_on_single_gpu(const GpuPsCommGraphFloatFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); void clear_graph_info(int index); void reset_feature_info(int gpu_id, size_t capacity, size_t feature_size); + void reset_float_feature_info(int gpu_id, size_t capacity, size_t feature_size); void clear_feature_info(int gpu_id, int index); void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, @@ -193,6 +205,14 @@ class GpuPsGraphTable std::shared_ptr &size_list_prefix_sum, std::shared_ptr &feature_list, // NOLINT std::shared_ptr &slot_list); // NOLINT + int get_float_feature_info_of_nodes( + int gpu_id, + uint64_t *d_nodes, + int node_num, + uint32_t *size_list, + uint32_t *size_list_prefix_sum, + std::shared_ptr &feature_list, // NOLINT + std::shared_ptr &slot_list); // NOLINT int get_feature_info_of_nodes_normal( int gpu_id, @@ -234,6 +254,15 @@ class GpuPsGraphTable uint32_t *actual_feature_size, uint64_t *feature_list, uint8_t *slot_list); + void move_float_result_to_source_gpu(int start_index, + int gpu_num, + int *h_left, + int *h_right, + int *fea_left, + uint32_t* fea_num_list, + uint32_t* actual_feature_size, + float* feature_list, + uint8_t *slot_list); void move_degree_to_source_gpu( int gpu_id, int gpu_num, int *h_left, int *h_right, int *node_degree); void move_result_to_source_gpu_all_edge_type(int gpu_id, @@ -257,9 +286,10 @@ class GpuPsGraphTable } int gpu_num; - int graph_table_num_, feature_table_num_; + int graph_table_num_, feature_table_num_{0}, float_feature_table_num_{0}; std::vector gpu_graph_list_; std::vector gpu_graph_fea_list_; + std::vector gpu_graph_float_fea_list_; int global_device_map[32]; const int parallel_sample_size = 1; const int dim_y = 256; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 834db992a53bb4..8d16f6822a50cc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ 
b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -96,6 +96,15 @@ __global__ void get_features_size(GpuPsFeaInfo* fea_info_array, } } +__global__ void get_float_features_size(GpuPsFeaInfo* fea_info_array, + uint32_t* feature_size, + int n) { + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < n) { + feature_size[idx] = fea_info_array[idx].feature_size; + } +} + __global__ void get_features_kernel(GpuPsCommGraphFea graph, GpuPsFeaInfo* fea_info_array, uint32_t* fea_size_prefix_sum, @@ -117,6 +126,27 @@ __global__ void get_features_kernel(GpuPsCommGraphFea graph, } } +__global__ void get_float_features_kernel(GpuPsCommGraphFloatFea graph, + GpuPsFeaInfo* fea_info_array, + uint32_t* fea_size_prefix_sum, + float* feature_array, + uint8_t* slot_array, + int n) { + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < n) { + uint32_t feature_size = fea_info_array[idx].feature_size; + if (feature_size == 0) { + return; + } + uint32_t src_offset = fea_info_array[idx].feature_offset; + uint32_t dst_offset = fea_size_prefix_sum[idx]; + for (uint32_t j = 0; j < feature_size; ++j) { + feature_array[dst_offset + j] = graph.feature_list[src_offset + j]; + slot_array[dst_offset + j] = graph.slot_id_list[src_offset + j]; + } + } +} + __global__ void get_features_kernel(GpuPsCommGraphFea graph, GpuPsFeaInfo* fea_info_array, int* actual_size, @@ -932,6 +962,7 @@ void GpuPsGraphTable::display_sample_res(void* key, } } +// use template parameters void GpuPsGraphTable::move_result_to_source_gpu(int start_index, int gpu_num, int* h_left, @@ -989,6 +1020,68 @@ void GpuPsGraphTable::move_result_to_source_gpu(int start_index, } } +void GpuPsGraphTable::move_float_result_to_source_gpu(int start_index, + int gpu_num, + int* h_left, + int* h_right, + int* fea_left, + uint32_t* fea_num_list, + uint32_t* actual_feature_size, + float* feature_list, + uint8_t* slot_list) { + int shard_len[gpu_num]; // NOLINT + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + shard_len[i] = h_right[i] - h_left[i] + 1; + int cur_step = static_cast(path_[start_index][i].nodes_.size() - 1); + for (int j = cur_step; j > 0; j--) { + auto& dst_node = path_[start_index][i].nodes_[j - 1]; + auto& src_node = path_[start_index][i].nodes_[j]; + MemcpyPeerAsync(dst_node.val_storage, + src_node.val_storage, + dst_node.val_bytes_len, + src_node.out_stream); + if (src_node.sync) { + CUDA_CHECK(cudaStreamSynchronize(src_node.out_stream)); + } + } + auto& node = path_[start_index][i].nodes_.front(); + if (fea_num_list[i] > 0) { + MemcpyPeerAsync(reinterpret_cast(feature_list + fea_left[i]), + node.val_storage + + sizeof(uint32_t) * (shard_len[i] + shard_len[i] % 2), + sizeof(float) * fea_num_list[i], + node.out_stream); + MemcpyPeerAsync(reinterpret_cast(slot_list + fea_left[i]), + node.val_storage + + sizeof(uint32_t) * (shard_len[i] + shard_len[i] % 2) + + sizeof(float) * fea_num_list[i], + sizeof(uint8_t) * fea_num_list[i], + node.out_stream); + } + if (shard_len[i] > 0) { + MemcpyPeerAsync(reinterpret_cast(actual_feature_size + h_left[i]), + node.val_storage, + sizeof(uint32_t) * shard_len[i], + node.out_stream); + // MemcpyPeerAsync(reinterpret_cast(actual_slot_size + h_left[i]), + // node.val_storage + + // sizeof(uint32_t) * (shard_len[i] + shard_len[i] % 2), + // sizeof(uint32_t) * shard_len[i], + // node.out_stream); + } + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[start_index][i].nodes_.front(); +
CUDA_CHECK(cudaStreamSynchronize(node.out_stream)); + } +} + void GpuPsGraphTable::move_result_to_source_gpu(int start_index, int gpu_num, int sample_size, @@ -1168,7 +1261,7 @@ __global__ void fill_size(uint32_t* d_actual_size_list, d_actual_size_list[idx[i]] = d_shard_size_list[i]; } } - +// make this a template __global__ void fill_feature_and_slot(uint64_t* dst_feature_list, uint8_t* dst_slot_list, uint32_t* dst_size_prefix_sum_list, @@ -1189,6 +1282,25 @@ __global__ void fill_feature_and_slot(uint64_t* dst_feature_list, } } +__global__ void fill_float_feature_and_slot(float* dst_feature_list, + uint8_t* dst_slot_list, + uint32_t* dst_size_prefix_sum_list, + float* src_feature_list, + uint8_t* src_slot_list, + uint32_t* src_size_prefix_sum_list, + uint32_t* src_size_list, + int* idx, + int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint32_t dst_index = dst_size_prefix_sum_list[idx[i]]; + uint32_t src_index = src_size_prefix_sum_list[i]; + for (uint32_t j = 0; j < src_size_list[i]; j++) { + dst_feature_list[dst_index + j] = src_feature_list[src_index + j]; + dst_slot_list[dst_index + j] = src_slot_list[src_index + j]; + } + } +} __global__ void fill_vari_feature_and_slot(uint64_t* dst_feature_list, uint8_t* dst_slot_list, @@ -1210,6 +1322,8 @@ __global__ void fill_vari_feature_and_slot(uint64_t* dst_feature_list, } } + + /* TODO: how to optimize it to eliminate the for loop @@ -1338,6 +1452,7 @@ __global__ void node_query_example(GpuPsCommGraph graph, void GpuPsGraphTable::clear_feature_info(int gpu_id) { int idx = 0; + // slot fea int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); if (offset < tables_.size()) { delete tables_[offset]; tables_[offset] = NULL; } @@ -1356,6 +1471,29 @@ void GpuPsGraphTable::clear_feature_info(int gpu_id) { graph.slot_id_list = NULL; } graph.feature_capacity = 0; + + if (float_feature_table_num_ > 0) { + // float fea + idx = 1; + int float_offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (float_offset < tables_.size()) { + delete tables_[float_offset]; + tables_[float_offset] = NULL; + } + + int graph_float_fea_idx = get_graph_float_fea_list_offset(gpu_id); + auto& float_graph = gpu_graph_float_fea_list_[graph_float_fea_idx]; + if (float_graph.feature_list != NULL) { + cudaFree(float_graph.feature_list); + float_graph.feature_list = NULL; + } + + if (float_graph.slot_id_list != NULL) { + cudaFree(float_graph.slot_id_list); + float_graph.slot_id_list = NULL; + } + float_graph.feature_capacity = 0; + } } void GpuPsGraphTable::reset_feature_info(int gpu_id, @@ -1394,6 +1532,46 @@ void GpuPsGraphTable::reset_feature_info(int gpu_id, } } + +void GpuPsGraphTable::reset_float_feature_info(int gpu_id, + size_t capacity, + size_t feature_size) { + int idx = 1; + auto stream = get_local_stream(gpu_id); + int offset = + get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = new Table(capacity, stream); + } + int graph_float_fea_idx = get_graph_float_fea_list_offset(gpu_id); + auto& graph = gpu_graph_float_fea_list_[graph_float_fea_idx]; + + graph.node_list = NULL; + + if (graph.feature_list == NULL) { + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, + feature_size * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, + ALIGN_INT64(feature_size * sizeof(uint8_t)))); + graph.feature_capacity = feature_size; + } else if (graph.feature_capacity < feature_size) { + cudaFree(graph.feature_list); +
cudaFree(graph.slot_id_list); + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, + feature_size * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, + ALIGN_INT64(feature_size * sizeof(uint8_t)))); + graph.feature_capacity = feature_size; + } else { + CUDA_CHECK(cudaMemsetAsync( + graph.feature_list, 0, feature_size * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync( + graph.slot_id_list, 0, feature_size * sizeof(uint8_t), stream)); + cudaStreamSynchronize(stream); + } +} + void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) { if (idx >= graph_table_num_) return; int offset = get_table_offset(gpu_id, GraphTableType::EDGE_TABLE, idx); @@ -1426,8 +1604,8 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, int gpu_id) { platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + int ntype_id = 0; // slot feature reset_feature_info(gpu_id, capacity, g.feature_size); - int ntype_id = 0; int offset = get_graph_fea_list_offset(gpu_id); int table_offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); @@ -1467,6 +1645,51 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, << gpu_graph_fea_list_[offset].feature_size; } +void GpuPsGraphTable::build_graph_float_fea_on_single_gpu(const GpuPsCommGraphFloatFea& g, + int gpu_id) { + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + int ntype_id = 1; // float feature + reset_float_feature_info(gpu_id, capacity, g.feature_size); + int offset = get_graph_float_fea_list_offset(gpu_id); + int table_offset = + get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); + if (g.node_size > 0) { + build_ps(gpu_id, + g.node_list, + reinterpret_cast(g.fea_info_list), + g.node_size, + HBMPS_MAX_BUFF, + 8, + table_offset); + gpu_graph_float_fea_list_[offset].node_size = g.node_size; + } else { + build_ps(gpu_id, NULL, NULL, 0, HBMPS_MAX_BUFF, 8, table_offset); + gpu_graph_float_fea_list_[offset].node_size = 0; + } + if (g.feature_size) { + auto stream = get_local_stream(gpu_id); + CUDA_CHECK(cudaMemcpyAsync(gpu_graph_float_fea_list_[offset].feature_list, + g.feature_list, + g.feature_size * sizeof(float), + cudaMemcpyHostToDevice, + stream)); + CUDA_CHECK(cudaMemcpyAsync(gpu_graph_float_fea_list_[offset].slot_id_list, + g.slot_id_list, + g.feature_size * sizeof(uint8_t), + cudaMemcpyHostToDevice, + stream)); + cudaStreamSynchronize(stream); + + gpu_graph_float_fea_list_[offset].feature_size = g.feature_size; + } else { + gpu_graph_float_fea_list_[offset].feature_size = 0; + } + VLOG(0) << "gpu node_float_feature info card :" << gpu_id << " ,node_size is " + << gpu_graph_float_fea_list_[offset].node_size << ", feature_size is " + << gpu_graph_float_fea_list_[offset].feature_size; +} + std::vector> GpuPsGraphTable::get_edge_type_graph(int gpu_id, int edge_type_len) { int total_gpu = resource_->total_device(); @@ -3140,6 +3363,7 @@ int GpuPsGraphTable::get_feature_info_of_nodes_normal( heter_comm_kernel_->fill_shard_key( d_shard_keys_ptr, d_nodes, d_idx_ptr, node_num, stream); + // slot feature std::vector d_fea_info(total_gpu, NULL); std::vector d_fea_size(total_gpu, NULL); std::vector d_fea_size_prefix_sum(total_gpu, NULL); @@ -3424,6 +3648,340 @@ int GpuPsGraphTable::get_feature_info_of_nodes_normal( return all_fea_num; } +int GpuPsGraphTable::get_float_feature_info_of_nodes( + int gpu_id, + 
uint64_t* d_nodes, + int node_num, + uint32_t* size_list, + uint32_t* size_list_prefix_sum, + std::shared_ptr& feature_list, + std::shared_ptr& slot_list) { + if (node_num == 0) { + return 0; + } + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + int total_gpu = resource_->total_device(); + auto stream = resource_->local_stream(gpu_id, 0); + + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); + auto d_idx = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = + memory::Alloc(place, + node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); + uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + split_idx_to_shard( + d_nodes, d_idx_ptr, node_num, d_left_ptr, d_right_ptr, gpu_id, stream); + + heter_comm_kernel_->fill_shard_key( + d_shard_keys_ptr, d_nodes, d_idx_ptr, node_num, stream); + + // float feature + std::vector d_fea_info(total_gpu, NULL); + std::vector d_fea_size(total_gpu, NULL); + std::vector d_fea_size_prefix_sum(total_gpu, NULL); + std::vector fea_num_list(total_gpu, 0); + std::vector fea_left(total_gpu, -1); + + int h_left[total_gpu]; // NOLINT + CUDA_CHECK(cudaMemcpyAsync(h_left, + d_left_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + int h_right[total_gpu]; // NOLINT + CUDA_CHECK(cudaMemcpyAsync(h_right, + d_right_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + device_mutex_[gpu_id]->lock(); + int shard_len[total_gpu]; // NOLINT + void* d_temp_storage[total_gpu]; + std::vector temp_storage_bytes(total_gpu, 0); + + for (int i = 0; i < total_gpu; ++i) { + shard_len[i] = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + d_temp_storage[i] = NULL; + if (h_left[i] == -1) { + continue; + } + create_storage(gpu_id, i, shard_len[i] * sizeof(uint64_t), 0); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.back(); + create_tmp_storage( + d_fea_info[i], gpu_id, i, shard_len[i] * sizeof(uint64_t)); + CUDA_CHECK(cudaMemsetAsync( + d_fea_info[i], 0, shard_len[i] * sizeof(uint64_t), node.in_stream)); + create_tmp_storage( + d_fea_size[i], gpu_id, i, shard_len[i] * sizeof(uint32_t)); + + create_tmp_storage(d_fea_size_prefix_sum[i], + gpu_id, + i, + (shard_len[i] + 1) * sizeof(uint32_t)); + CUDA_CHECK(cub::DeviceScan::InclusiveSum( + NULL, + temp_storage_bytes[i], + reinterpret_cast(d_fea_size[i]), + reinterpret_cast(d_fea_size_prefix_sum[i] + 1), + shard_len[i], + resource_->remote_stream(i, gpu_id))); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream( + i, gpu_id))); // wait for calc temp_storage_bytes + create_tmp_storage(d_temp_storage[i], gpu_id, i, temp_storage_bytes[i]); + } + walk_to_dest(gpu_id, + total_gpu, + h_left, + h_right, + reinterpret_cast(d_shard_keys_ptr), + NULL); + + // no sync, so all 8 cards can execute in parallel + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.back(); + // If not found, val is -1. + int table_offset = get_table_offset(i, GraphTableType::FEATURE_TABLE, 1); + // CUDA_CHECK(cudaStreamSynchronize( + // node.in_stream)); // wait for walk_to_dest and memset + tables_[table_offset]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(d_fea_info[i]), + static_cast(h_right[i] - h_left[i] + 1), + resource_->remote_stream(i, gpu_id)); + dim3 grid((shard_len[i] - 1) / dim_y + 1); + dim3 block(1, dim_y); + + get_float_features_size<<remote_stream(i, gpu_id)>>>( + reinterpret_cast(d_fea_info[i]), + reinterpret_cast(d_fea_size[i]), + shard_len[i]); + + CUDA_CHECK(cudaMemsetAsync(d_fea_size_prefix_sum[i], + 0, + sizeof(uint32_t), + resource_->remote_stream(i, gpu_id))); + CUDA_CHECK(cub::DeviceScan::InclusiveSum( + d_temp_storage[i], + temp_storage_bytes[i], + reinterpret_cast(d_fea_size[i]), + reinterpret_cast(d_fea_size_prefix_sum[i]) + 1, + shard_len[i], + resource_->remote_stream(i, gpu_id))); + } + + // wait for fea_num_list + for (int i = 0; i < total_gpu; ++i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + if (h_left[i] == -1) { + continue; + } + auto& node = path_[gpu_id][i].nodes_.back(); + CUDA_CHECK(cudaMemcpyAsync( + &fea_num_list[i], + reinterpret_cast(d_fea_size_prefix_sum[i]) + shard_len[i], + sizeof(uint32_t), + cudaMemcpyDeviceToHost, + resource_->remote_stream(i, gpu_id))); + + CUDA_CHECK(cudaStreamSynchronize( + resource_->remote_stream(i, gpu_id))); // wait for fea_num_list + + // now we have the feature sizes of all keys on each card + create_storage(gpu_id, + i, + 0, + (shard_len[i] + shard_len[i] % 2) * sizeof(uint32_t) + + fea_num_list[i] * sizeof(float) + + fea_num_list[i] * sizeof(uint8_t)); + uint32_t* actual_size_array = reinterpret_cast(node.val_storage); + CUDA_CHECK(cudaMemcpyAsync(actual_size_array, + d_fea_size[i], + sizeof(uint32_t) * shard_len[i], + cudaMemcpyDeviceToDevice, + resource_->remote_stream(i, gpu_id))); + + int offset = get_graph_float_fea_list_offset(i); + auto& graph = gpu_graph_float_fea_list_[offset]; + 
+ float* feature_array = reinterpret_cast( + actual_size_array + shard_len[i] + shard_len[i] % 2); + + uint8_t* slot_array = + reinterpret_cast(feature_array + fea_num_list[i]); + + dim3 grid((shard_len[i] - 1) / dim_y + 1); + dim3 block(1, dim_y); + get_float_features_kernel<<remote_stream(i, gpu_id)>>>( + graph, + reinterpret_cast(d_fea_info[i]), + reinterpret_cast(d_fea_size_prefix_sum[i]), + feature_array, + slot_array, + shard_len[i]); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + CUDA_CHECK(cudaStreamSynchronize(resource_->remote_stream(i, gpu_id))); + } + + uint32_t all_fea_num = 0; + for (int i = 0; i < total_gpu; ++i) { + fea_left[i] = all_fea_num; + all_fea_num += fea_num_list[i]; + } + auto feature_list_tmp = + memory::Alloc(place, + all_fea_num * sizeof(float), + phi::Stream(reinterpret_cast(stream))); + float* d_feature_list_ptr = + reinterpret_cast(feature_list_tmp->ptr()); + auto slot_list_tmp = + memory::Alloc(place, + all_fea_num * sizeof(uint8_t), + phi::Stream(reinterpret_cast(stream))); + uint8_t* d_slot_list_ptr = reinterpret_cast(slot_list_tmp->ptr()); + + auto size_list_tmp = + memory::Alloc(place, + node_num * sizeof(uint32_t), + phi::Stream(reinterpret_cast(stream))); + uint32_t* d_size_list_ptr = reinterpret_cast(size_list_tmp->ptr()); + + move_float_result_to_source_gpu(gpu_id, + total_gpu, + h_left, + h_right, + fea_left.data(), + fea_num_list.data(), + d_size_list_ptr, + d_feature_list_ptr, + d_slot_list_ptr); + + for (int i = 0; i < total_gpu; ++i) { + if (shard_len[i] == 0) { + continue; + } + destroy_storage(gpu_id, i); + if (d_fea_info[i] != NULL) { + destroy_tmp_storage(d_fea_info[i], gpu_id, i); + } + if (d_fea_size[i] != NULL) { + destroy_tmp_storage(d_fea_size[i], gpu_id, i); + } + if (d_fea_size_prefix_sum[i] != NULL) { + destroy_tmp_storage(d_fea_size_prefix_sum[i], gpu_id, i); + } + if (d_temp_storage[i] != NULL) { + destroy_tmp_storage(d_temp_storage[i], gpu_id, i); + } + } + + d_fea_info.clear(); + d_fea_size.clear(); + d_fea_size_prefix_sum.clear(); + device_mutex_[gpu_id]->unlock(); + feature_list = + memory::Alloc(place, + all_fea_num * sizeof(float), + phi::Stream(reinterpret_cast(stream))); + + float* d_res_feature_list_ptr = + reinterpret_cast(feature_list->ptr()); + + slot_list = + memory::Alloc(place, + all_fea_num * sizeof(uint8_t), + phi::Stream(reinterpret_cast(stream))); + + uint8_t* d_res_slot_list_ptr = reinterpret_cast(slot_list->ptr()); + + int grid_size = (node_num - 1) / block_size_ + 1; + fill_size<<>>( + size_list, d_size_list_ptr, d_idx_ptr, node_num); + size_t storage_bytes = 0; + auto src_fea_size_prefix_sum = + memory::Alloc(place, + node_num * sizeof(uint32_t), + phi::Stream(reinterpret_cast(stream))); + + uint32_t* src_fea_size_prefix_sum_ptr = + reinterpret_cast(src_fea_size_prefix_sum->ptr()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cub::DeviceScan::ExclusiveSum( + NULL, storage_bytes, size_list, size_list_prefix_sum, node_num, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + auto d_temp_storage_tmp = + memory::Alloc(place, + storage_bytes, + phi::Stream(reinterpret_cast(stream))); + CUDA_CHECK(cub::DeviceScan::ExclusiveSum(d_temp_storage_tmp->ptr(), + storage_bytes, + size_list, + size_list_prefix_sum, + node_num, + stream)); + + CUDA_CHECK(cub::DeviceScan::ExclusiveSum(d_temp_storage_tmp->ptr(), + storage_bytes, + d_size_list_ptr, + src_fea_size_prefix_sum_ptr, + node_num, + stream)); + fill_float_feature_and_slot<<>>( + d_res_feature_list_ptr, + 
d_res_slot_list_ptr, + size_list_prefix_sum, + d_feature_list_ptr, + d_slot_list_ptr, + src_fea_size_prefix_sum_ptr, + d_size_list_ptr, + d_idx_ptr, + node_num); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + return all_fea_num; +} + + int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, uint64_t* d_nodes, uint64_t* d_feature, diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 96115cc0b93ba8..a6240e00e31e4c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -33,8 +33,8 @@ void GraphGpuWrapper::set_device(std::vector ids) { } } -void GraphGpuWrapper::init_conf(const std::string &first_node_type, - const std::string &meta_path, +void GraphGpuWrapper::init_conf(const std::string &first_node_type_str, + const std::string &meta_path_str, const std::string &excluded_train_pair, const std::string &pair_label) { static std::mutex mutex; @@ -45,40 +45,70 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type, } VLOG(2) << "init path config"; conf_initialized_ = true; - auto node_types = - paddle::string::split_string(first_node_type, ";"); - VLOG(2) << "node_types: " << first_node_type; - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - PADDLE_ENFORCE_NE( - iter, - node_to_id.end(), - platform::errors::NotFound("(%s) is not found in node_to_id.", type)); - VLOG(2) << "node_to_id[" << type << "] = " << iter->second; - first_node_type_.push_back(iter->second); + + std::vector first_node_type_vec; + if (first_node_type_str[0] == '[') { + assert(first_node_type_str[first_node_type_str.size() -1] == ']'); + std::string tmp_first_node_type_str(first_node_type_str, 1, first_node_type_str.size() - 2); + auto tmp_first_node_types = paddle::string::split_string(tmp_first_node_type_str, ","); + first_node_type_vec.assign(tmp_first_node_types.begin(), tmp_first_node_types.end()); + } else { + first_node_type_vec.push_back(first_node_type_str); } - meta_path_.resize(first_node_type_.size()); - auto meta_paths = paddle::string::split_string(meta_path, ";"); - - for (size_t i = 0; i < meta_paths.size(); i++) { - auto path = meta_paths[i]; - auto edges = paddle::string::split_string(path, "-"); - for (auto &edge : edges) { - auto iter = edge_to_id.find(edge); - PADDLE_ENFORCE_NE(iter, - edge_to_id.end(), - platform::errors::NotFound( - "(%s) is not found in edge_to_id.", edge)); - VLOG(2) << "edge_to_id[" << edge << "] = " << iter->second; - meta_path_[i].push_back(iter->second); - if (edge_to_node_map_.find(iter->second) == edge_to_node_map_.end()) { - auto nodes = get_ntype_from_etype(edge); - uint64_t src_node_id = node_to_id.find(nodes[0])->second; - uint64_t dst_node_id = node_to_id.find(nodes[1])->second; - edge_to_node_map_[iter->second] = src_node_id << 32 | dst_node_id; - } - } + tensor_pair_num_ = first_node_type_vec.size(); + first_node_type_.resize(tensor_pair_num_); + for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; ++tensor_pair_idx) { + auto &first_node_type = first_node_type_vec[tensor_pair_idx]; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + for (auto &type : node_types) { + auto iter = node_to_id.find(type); + PADDLE_ENFORCE_NE( + iter, + node_to_id.end(), + platform::errors::NotFound("(%s) is not found in node_to_id.", type)); + VLOG(2) << "node_to_id[" << type << "] = " << iter->second; + 
first_node_type_[tensor_pair_idx].push_back(iter->second); + } // end for (auto &type : node_types) + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; + + std::vector meta_path_vec; + if (meta_path_str[0] == '[') { + assert(meta_path_str[meta_path_str.size() -1] == ']'); + std::string tmp_meta_path(meta_path_str, 1, meta_path_str.size() - 2); + auto tmp_meta_paths = paddle::string::split_string(tmp_meta_path, ","); + meta_path_vec.assign(tmp_meta_paths.begin(), tmp_meta_paths.end()); + } else { + meta_path_vec.push_back(meta_path_str); } + assert(tensor_pair_num_ == meta_path_vec.size()); + meta_path_.resize(tensor_pair_num_); + for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; ++tensor_pair_idx) { + auto &meta_path = meta_path_vec[tensor_pair_idx]; + meta_path_[tensor_pair_idx].resize(first_node_type_[tensor_pair_idx].size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto edges = paddle::string::split_string(path, "-"); + for (auto &edge : edges) { + auto iter = edge_to_id.find(edge); + PADDLE_ENFORCE_NE(iter, + edge_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in edge_to_id.", edge)); + VLOG(2) << "edge_to_id[" << edge << "] = " << iter->second; + meta_path_[tensor_pair_idx][i].push_back(iter->second); + if (edge_to_node_map_.find(iter->second) == edge_to_node_map_.end()) { + auto nodes = get_ntype_from_etype(edge); + uint64_t src_node_id = node_to_id.find(nodes[0])->second; + uint64_t dst_node_id = node_to_id.find(nodes[1])->second; + edge_to_node_map_[iter->second] = src_node_id << 32 | dst_node_id; + } + } // end for (auto &edge : edges) { + } // end for (size_t i = 0; i < meta_paths.size(); i++) { + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; auto paths = paddle::string::split_string(excluded_train_pair, ";"); @@ -135,28 +165,37 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type, max_dev_id = device_id_mapping[i]; } } - finish_node_type_.resize(max_dev_id + 1); - node_type_start_.resize(max_dev_id + 1); + + finish_node_type_.resize(tensor_pair_num_); + node_type_start_.resize(tensor_pair_num_); global_infer_node_type_start_.resize(max_dev_id + 1); - for (size_t i = 0; i < device_id_mapping.size(); i++) { - int dev_id = device_id_mapping[i]; - auto &node_type_start = node_type_start_[i]; - auto &infer_node_type_start = global_infer_node_type_start_[i]; - auto &finish_node_type = finish_node_type_[i]; - finish_node_type.clear(); - - for (size_t idx = 0; idx < node_to_id.size(); idx++) { - infer_node_type_start[idx] = 0; - } - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - node_type_start[iter->second] = 0; - infer_node_type_start[iter->second] = 0; + infer_cursor_.resize(tensor_pair_num_); + for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; ++tensor_pair_idx) { + finish_node_type_[tensor_pair_idx].resize(max_dev_id + 1); + node_type_start_[tensor_pair_idx].resize(max_dev_id + 1); + auto &first_node_type = first_node_type_vec[tensor_pair_idx]; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + for (size_t i = 0; i < device_id_mapping.size(); i++) { + int dev_id = device_id_mapping[i]; + auto &node_type_start = node_type_start_[tensor_pair_idx][i]; + auto &infer_node_type_start = global_infer_node_type_start_[i]; + auto &finish_node_type = finish_node_type_[tensor_pair_idx][i]; + finish_node_type.clear(); + 
+ for (size_t idx = 0; idx < node_to_id.size(); idx++) { + infer_node_type_start[idx] = 0; + } + for (auto &type : node_types) { + auto iter = node_to_id.find(type); + node_type_start[iter->second] = 0; + infer_node_type_start[iter->second] = 0; + } + infer_cursor_[tensor_pair_idx].push_back(0); + cursor_.push_back(0); } - infer_cursor_.push_back(0); - cursor_.push_back(0); - } - } + } // end for (int tensor_pair_idx = 0; tensor_pair_idx < tensor_pair_num_; + } // end static std::mutex mutex; } void GraphGpuWrapper::init_type_keys( @@ -455,6 +494,11 @@ void GraphGpuWrapper::set_feature_separator(std::string ch) { } } +void GraphGpuWrapper::set_feature_info(int slot_num_for_pull_feature, int float_slot_num) { + this->slot_num_for_pull_feature_ = slot_num_for_pull_feature; + this->float_slot_num_ = float_slot_num; +} + void GraphGpuWrapper::set_slot_feature_separator(std::string ch) { slot_feature_separator_ = ch; if (graph_table != nullptr) { @@ -740,7 +784,7 @@ void GraphGpuWrapper::init_service() { #endif size_t gpu_num = device_id_mapping.size(); - GpuPsGraphTable *g = new GpuPsGraphTable(resource, id_to_edge.size()); + GpuPsGraphTable *g = new GpuPsGraphTable(resource, id_to_edge.size(), slot_num_for_pull_feature_, float_slot_num_); g->init_cpu_table(table_proto, gpu_num); g->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_, rank_id_); g->cpu_graph_table_->set_feature_separator(feature_separator_); @@ -795,7 +839,8 @@ void GraphGpuWrapper::upload_batch(int table_type, // feature table void GraphGpuWrapper::upload_batch(int table_type, int slice_num, - int slot_num) { + int slot_num, + int float_slot_num) { if (table_type == GraphTableType::FEATURE_TABLE && (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode:: MEM_EMB_FEATURE_AND_GPU_GRAPH || @@ -812,6 +857,7 @@ void GraphGpuWrapper::upload_batch(int table_type, std::vector> tasks; for (int i = 0; i < slice_num; i++) { tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int { + // build slot feature VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size[" << node_ids[i].size() << "]"; GpuPsCommGraphFea sub_graph = @@ -822,6 +868,19 @@ void GraphGpuWrapper::upload_batch(int table_type, g->build_graph_fea_on_single_gpu(sub_graph, i); sub_graph.release_on_cpu(); VLOG(0) << "sub graph fea on gpu " << i << " is built"; + if (float_slot_num > 0) { + // build float feature + VLOG(0) << "begin make_gpu_ps_graph_float_fea, node_ids[" << i << "]_size[" + << node_ids[i].size() << "]"; + GpuPsCommGraphFloatFea float_sub_graph = + g->cpu_graph_table_->make_gpu_ps_graph_float_fea(i, node_ids[i], float_slot_num); + // sub_graph.display_on_cpu(); + VLOG(0) << "begin build_graph_float_fea_on_single_gpu, node_ids[" << i + << "]_size[" << node_ids[i].size() << "]"; + g->build_graph_float_fea_on_single_gpu(float_sub_graph, i); + float_sub_graph.release_on_cpu(); + VLOG(0) << "float sub graph fea on gpu " << i << " is built"; + } return 0; })); } @@ -848,6 +907,25 @@ std::vector GraphGpuWrapper::get_sub_graph_fea( return sub_graph_feas; } +// get sub_graph_float_fea +std::vector GraphGpuWrapper::get_sub_graph_float_fea( + std::vector> &node_ids, int float_slot_num) { + if (float_slot_num == 0) return {}; + GpuPsGraphTable *g = reinterpret_cast(graph_table); + std::vector> tasks; + std::vector sub_graph_float_feas(node_ids.size()); + for (int i = 0; i < node_ids.size(); i++) { + tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int { + GpuPsGraphTable *g = reinterpret_cast(graph_table); + 
sub_graph_float_feas[i] = + g->cpu_graph_table_->make_gpu_ps_graph_float_fea(i, node_ids[i], float_slot_num); + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return sub_graph_float_feas; +} + // build_gpu_graph_fea void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, int i) { @@ -858,6 +936,16 @@ void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, return; } +// build_gpu_graph_float_fea +void GraphGpuWrapper::build_gpu_graph_float_fea(GpuPsCommGraphFloatFea &sub_graph_float_fea, + int i) { + GpuPsGraphTable *g = reinterpret_cast(graph_table); + g->build_graph_float_fea_on_single_gpu(sub_graph_float_fea, i); + sub_graph_float_fea.release_on_cpu(); + VLOG(1) << "sub graph float fea on gpu " << i << " is built"; + return; +} + NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch, bool compress, bool weighted) { @@ -924,6 +1012,28 @@ int GraphGpuWrapper::get_feature_info_of_nodes( slot_list); } +int GraphGpuWrapper::get_float_feature_info_of_nodes( + int gpu_id, + uint64_t *d_nodes, + int node_num, + uint32_t *size_list, + uint32_t *size_list_prefix_sum, + std::shared_ptr &feature_list, + std::shared_ptr &slot_list) { + platform::CUDADeviceGuard guard(gpu_id); + PADDLE_ENFORCE_NOT_NULL(graph_table, + paddle::platform::errors::InvalidArgument( + "graph_table should not be null")); + return reinterpret_cast(graph_table) + ->get_float_feature_info_of_nodes(gpu_id, + d_nodes, + node_num, + size_list, + size_list_prefix_sum, + feature_list, + slot_list); +} + int GraphGpuWrapper::get_feature_of_nodes(int gpu_id, uint64_t *d_walk, uint64_t *d_offset, diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 08e88dee14852f..5086095e26fa60 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -70,10 +70,13 @@ class GraphGpuWrapper { void upload_batch(int table_type, int slice_num, const std::string& edge_type); - void upload_batch(int table_type, int slice_num, int slot_num); + void upload_batch(int table_type, int slice_num, int slot_num, int float_slot_num); std::vector get_sub_graph_fea( std::vector>& node_ids, int slot_num); // NOLINT + std::vector get_sub_graph_float_fea( + std::vector>& node_ids, int float_slot_num); // NOLINT void build_gpu_graph_fea(GpuPsCommGraphFea& sub_graph_fea, int i); // NOLINT + void build_gpu_graph_float_fea(GpuPsCommGraphFloatFea& sub_graph_float_fea, int i); // NOLINT void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, @@ -165,6 +168,7 @@ class GraphGpuWrapper { std::vector> get_edge_type_graph( int gpu_id, int edge_type_len); std::vector slot_feature_num_map() const; + void set_feature_info(int slot_num_for_pull_feature, int float_slot_num); void set_feature_separator(std::string ch); void set_slot_feature_separator(std::string ch); void set_infer_mode(bool infer_mode); @@ -183,7 +187,14 @@ class GraphGpuWrapper { std::shared_ptr& size_list_prefix_sum, std::shared_ptr& feature_list, // NOLINT std::shared_ptr& slot_list); // NOLINT - + int get_float_feature_info_of_nodes( + int gpu_id, + uint64_t *d_nodes, + int node_num, + uint32_t *size_list, + uint32_t *size_list_prefix_sum, + std::shared_ptr &feature_list, // NOLINT + std::shared_ptr &slot_list); // NOLINT void init_metapath(std::string cur_metapath, int cur_metapath_index, int 
cur_metapath_len); @@ -213,21 +224,24 @@ class GraphGpuWrapper { int search_level = 1; void* graph_table; int upload_num = 8; + int slot_num_for_pull_feature_ = 0; + int float_slot_num_ = 0; std::shared_ptr<::ThreadPool> upload_task_pool; std::string feature_separator_ = std::string(" "); bool conf_initialized_ = false; bool type_keys_initialized_ = false; - std::vector first_node_type_; + std::vector> first_node_type_; std::vector excluded_train_pair_; std::vector pair_label_conf_; - std::vector> meta_path_; + std::vector>> meta_path_; - std::vector> finish_node_type_; - std::vector> node_type_start_; + std::vector>> finish_node_type_; + std::vector>> node_type_start_; std::vector cur_metapath_start_; std::vector> global_infer_node_type_start_; - std::vector infer_cursor_; + std::vector> infer_cursor_; std::vector cursor_; + int tensor_pair_num_; std::vector>> d_graph_all_type_total_keys_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 0e0e50babffea0..b50705bf033bb7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" - namespace paddle { namespace framework { @@ -460,7 +459,6 @@ template void HashTable::get( // HashTable::get( // const uint64_t* d_keys, char* d_vals, size_t len, cudaStream_t // stream); - template void HashTable::insert( const uint64_t* d_keys, const float* d_vals, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 11c3c2676a539c..bffde654fa56ce 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -361,7 +361,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, add_key_to_gputask(gpu_task); } - void PSGPUWrapper::add_slot_feature(std::shared_ptr gpu_task) { platform::Timer timeline; platform::Timer time_stage; @@ -434,6 +433,7 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr gpu_task) { threads.clear(); time_stage.Pause(); divide_nodeid_cost = time_stage.ElapsedSec(); +if (slot_num_for_pull_feature_ > 0) { #if defined(PADDLE_WITH_PSCORE) && defined(PADDLE_WITH_GPU_GRAPH) gpu_task->sub_graph_feas = reinterpret_cast(new std::vector); @@ -660,12 +660,32 @@ void PSGPUWrapper::add_slot_feature(std::shared_ptr gpu_task) { << " get_feature_id_cost " << get_feature_id_cost << " add_feature_to_set_cost " << add_feature_to_set_cost << " add_feature_to_key_cost " << add_feature_to_key_cost; + } +#if defined(PADDLE_WITH_PSCORE) && defined(PADDLE_WITH_GPU_GRAPH) + if (float_slot_num_ > 0) { + if (FLAGS_gpugraph_storage_mode == + paddle::framework::GpuGraphStorageMode:: + MEM_EMB_FEATURE_AND_GPU_GRAPH || + FLAGS_gpugraph_storage_mode == + paddle::framework::GpuGraphStorageMode:: + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + gpu_task->sub_graph_float_feas = + reinterpret_cast(new std::vector); + std::vector& sub_graph_float_feas = + *((std::vector*)gpu_task->sub_graph_float_feas); + if (float_slot_num_ > 0) { + sub_graph_float_feas = gpu_graph_ptr->get_sub_graph_float_fea(node_ids, float_slot_num_); + } + } + } +#endif } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; #if defined(PADDLE_WITH_PSCORE) && 
defined(PADDLE_WITH_GPU_GRAPH) - if (slot_num_for_pull_feature_ > 0 && + if ((slot_num_for_pull_feature_ > 0 || float_slot_num_ > 0) && FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { add_slot_feature(gpu_task); @@ -1397,6 +1417,18 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { (std::vector*)gpu_task->sub_graph_feas; gpu_graph_ptr->build_gpu_graph_fea((*tmp)[i], i); } + + if (float_slot_num_> 0 && + (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode:: + MEM_EMB_FEATURE_AND_GPU_GRAPH || + FLAGS_gpugraph_storage_mode == + paddle::framework::GpuGraphStorageMode:: + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH)) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + std::vector* float_tmp = + (std::vector*)gpu_task->sub_graph_float_feas; + gpu_graph_ptr->build_gpu_graph_float_fea((*float_tmp)[i], i); + } #endif stagetime.Pause(); auto build_feature_span = stagetime.ElapsedSec(); @@ -1472,6 +1504,11 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { (std::vector*)gpu_task->sub_graph_feas; delete tmp; gpu_task->sub_graph_feas = NULL; + + std::vector* float_tmp = + (std::vector*)gpu_task->sub_graph_float_feas; + delete float_tmp; + gpu_task->sub_graph_float_feas = NULL; } #endif stagetime.Pause(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 5e940487e2ac0d..df5dbcb82901c5 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -798,9 +798,12 @@ class PSGPUWrapper { slot_vector_ = slot_vector; VLOG(0) << "slot_vector size is " << slot_vector_.size(); } - void SetPullFeatureSlotNum(int slot_num) { - slot_num_for_pull_feature_ = slot_num; - VLOG(0) << "slot_num_for_pull_feature_ is " << slot_num_for_pull_feature_; + void SetPullFeatureSlotNum(int sparse_slot_num, int float_slot_num) { + slot_num_for_pull_feature_ = sparse_slot_num; + float_slot_num_ = float_slot_num; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + gpu_graph_ptr->set_feature_info(slot_num_for_pull_feature_, float_slot_num_); + VLOG(0) << "slot_num_for_pull_feature_ is " << slot_num_for_pull_feature_ << ", float_slot_num is " << float_slot_num_; } void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; @@ -942,6 +945,7 @@ class PSGPUWrapper { int multi_mf_dim_{0}; int max_mf_dim_{0}; int slot_num_for_pull_feature_{0}; + int float_slot_num_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; size_t pull_type_size_{0}; diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 7acbfd4a0b50a6..e9f9d42528c638 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -65,11 +65,20 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { } } int HogwildWorker::IsParameter(const std::string &name, bool full_match) { +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_GPU_GRAPH) + auto gpu_ps = PSGPUWrapper::GetInstance(); + bool last_device = ((thread_num_ - 1) == thread_id_); +#endif if (full_match) { auto it = params2rootid_.find(name); if (it == params2rootid_.end()) { return -1; } +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_GPU_GRAPH) + if (last_device && !gpu_ps->IsKeyForSelfRank(it->second)) { + free_param_vars_.insert(name); + } +#endif if (it->second == nccl_rank_id_) { return 1; } @@ -80,6 +89,11 @@ int HogwildWorker::IsParameter(const std::string &name, bool 
full_match) { if (strncmp(name.c_str(), it->first.c_str(), it->first.length()) != 0) { continue; } +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_GPU_GRAPH) + if (last_device && !gpu_ps->IsKeyForSelfRank(it->second)) { + free_param_vars_.insert(name); + } +#endif if (it->second == nccl_rank_id_) { return 1; } @@ -377,9 +391,14 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { int persist_param = 0; int persist_share = 0; int persist_reset = 0; + std::vector del_var_names; for (auto &var : block.AllVars()) { auto name = var->Name(); if (remove_vars_.find(name) != remove_vars_.end()) { + if (free_param_vars_.find(name) != free_param_vars_.end()) { + del_var_names.push_back(name); + VLOG(1) << "remove need delete var name=" << name; + } continue; } all_param_.push_back(name); @@ -421,8 +440,7 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { continue; } // reset tensor holder - if (persist_param_vars_.find(name) != persist_param_vars_.end() && - platform::is_gpu_place(root_tensor->place())) { + if (persist_param_vars_.find(name) != persist_param_vars_.end()) { phi::DenseTensor cpu_tensor; TensorCopy(*root_tensor, platform::CPUPlace(), &cpu_tensor); root_tensor->MoveMemoryHolder(); @@ -437,6 +455,10 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { need_copy_vars_.push_back(name); } } else { + if (free_param_vars_.find(name) != free_param_vars_.end()) { + del_var_names.push_back(name); + VLOG(0) << "unpersist need delete var name=" << name; + } // sharding vars auto *ptr = thread_scope_->Var(name); InitializeVariable(ptr, var->GetType()); @@ -450,11 +472,16 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { InitializeVariable(ptr, var->GetType()); } } + // multi node delete unused vars + if (!del_var_names.empty()) { + root_scope_->EraseVars(del_var_names); + } VLOG(0) << "device id=" << thread_id_ << ", total param count=" << all_param_.size() << ", persist count=" << persist_total << ", param=" << persist_param << ", share=" << persist_share << ", reset=" << persist_reset - << ", need copy param count=" << need_copy_vars_.size(); + << ", need copy param count=" << need_copy_vars_.size() + << ", delete vars count=" << del_var_names.size(); } void HogwildWorker::Finalize() { #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index bdc38674fcf0c8..980349e145c41c 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -395,7 +395,7 @@ void BindGraphGpuWrapper(py::module* m) { py::overload_cast( &GraphGpuWrapper::upload_batch)) .def("upload_batch", - py::overload_cast(&GraphGpuWrapper::upload_batch)) + py::overload_cast(&GraphGpuWrapper::upload_batch)) .def( "get_all_id", py::overload_cast>*>( diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 1abbb9c03441af..421ce8d623a9de 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -1622,7 +1622,7 @@ def _build_groups(self): # sharding if self.sharding_degree > 1: - self.sharding_ring_id = 1 + self.sharding_ring_id = self.mp_ring_id + 1 self.sharding_rank = ( self.global_rank // self.mp_degree ) % self.sharding_degree @@ -2288,51 +2288,36 @@ def _init_comm(self): self.node_nums == node_nums ), "end points not equal node nums" self.current_endpoint = 
self.global_endpoints[self.role_id] + # mp ring if self.mp_degree > 1: - if node_nums > 1: - self._init_communicator( - self._startup_program, - self.current_endpoint, - self.mp_group_endpoints, - self.role_id, - self.mp_ring_id, - ) - else: - startup_block.append_op( - type='c_comm_init_all', - attrs={'ring_id': self.mp_ring_id}) - + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.mp_group_endpoints, + self.role_id, + self.mp_ring_id, + ) + # sharding ring if self.sharding_degree > 1: - if node_nums > 1: - self._init_communicator( - self._startup_program, - self.current_endpoint, - self.sharding_group_endpoints, - self.role_id, - self.sharding_ring_id, - ) - else: - startup_block.append_op( - type='c_comm_init_all', - attrs={'ring_id': self.sharding_ring_id}) - + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.sharding_group_endpoints, + self.role_id, + self.sharding_ring_id, + ) # pure dp ring if self.dp_degree > 1: - if node_nums > 1: - self._init_communicator( - self._startup_program, - self.current_endpoint, - self.dp_group_endpoints, - self.role_id, - self.dp_ring_id, - ) - else: - startup_block.append_op( - type='c_comm_init_all', - attrs={'ring_id': self.dp_ring_id}) + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.dp_group_endpoints, + self.role_id, + self.dp_ring_id, + ) startup_block._sync_with_cpp() @@ -2355,34 +2340,41 @@ def _init_communicator( ring_id ): nranks = len(endpoints) - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) block = program.global_block() - - nccl_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': role_id, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_multitrainer', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'ntrainers': nranks, - 'trainer_id': role_id, - 'ring_id': ring_id, - self.op_role_key: OpRole.Forward, - }, - ) + # init mulit node nccl + if nranks > 1: + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': role_id, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + self.op_role_key: OpRole.Forward, + }, + ) + block.append_op( + type='c_comm_init_multitrainer', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'ntrainers': nranks, + 'trainer_id': role_id, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward, + }, + ) + else: + block.append_op( + type='c_comm_init_all', + attrs={'ring_id': ring_id} + ) \ No newline at end of file diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index d5ad9f4745c267..aa53a7a94f5fc6 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1768,7 +1768,7 @@ def _shrink(self, threshold=None): threshold = 0 fleet.util.barrier() - if self.role_maker._is_first_worker(): + if self.context['use_ps_gpu'] or self.role_maker._is_first_worker(): sparses = get_the_one_recv_context( self.context, is_dense=False,
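
The patch threads float (dense) node features through the same multi-GPU path used by slot features: SetPullFeatureSlotNum now carries a float_slot_num and forwards it via set_feature_info, upload_batch builds a per-card float feature table through make_gpu_ps_graph_float_fea and build_graph_float_fea_on_single_gpu, and get_float_feature_info_of_nodes shards the query keys, pulls the float values from every card, and merges them back on the querying GPU. The sketch below shows how a caller might drive that flow once the graph and features have been loaded and init_service() has run. It is a minimal illustration, not code from the patch: pull_float_features_sketch and its buffer names are hypothetical, the namespace qualifications are assumed, and the shared_ptr element type phi::Allocation is assumed because the template arguments are not visible in this diff.

#include <memory>
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"

using paddle::framework::GraphGpuWrapper;  // namespace assumed

// Hypothetical driver for the float-feature pull path (error handling omitted).
void pull_float_features_sketch(int gpu_id,
                                uint64_t* d_nodes,                 // device buffer of node ids
                                int node_num,
                                uint32_t* d_size_list,             // out: per-node float feature count (device, node_num entries)
                                uint32_t* d_size_list_prefix_sum,  // out: exclusive prefix sum of the counts (device, node_num entries)
                                int slot_num_for_pull_feature,
                                int float_slot_num) {
  auto wrapper = GraphGpuWrapper::GetInstance();

  // 1. Record how many sparse slots and float slots will be pulled.
  wrapper->set_feature_info(slot_num_for_pull_feature, float_slot_num);

  // 2. Build the slot feature tables and, when float_slot_num > 0, the float
  //    feature tables on every card.
  wrapper->upload_batch(GraphTableType::FEATURE_TABLE,  // enum qualification assumed
                        /*slice_num=*/8,
                        slot_num_for_pull_feature,
                        float_slot_num);

  // 3. Pull the float features of d_nodes back onto gpu_id; feature_list receives
  //    the concatenated float values, slot_list the matching uint8 slot ids.
  std::shared_ptr<phi::Allocation> feature_list;  // element type assumed
  std::shared_ptr<phi::Allocation> slot_list;
  int total_fea_num = wrapper->get_float_feature_info_of_nodes(
      gpu_id, d_nodes, node_num, d_size_list, d_size_list_prefix_sum,
      feature_list, slot_list);

  // The floats of the k-th queried node start at d_size_list_prefix_sum[k] and
  // span d_size_list[k] entries of feature_list / slot_list.
  (void)total_fea_num;
}

init_conf now also accepts bracketed lists so that several tensor pairs can be configured in one call: ',' separates pairs, ';' keeps its original meaning inside each pair, and first_node_type_str must contain as many pairs as meta_path_str; the original un-bracketed single-pair form still works. A hypothetical call is shown below (the node and edge type names are placeholders that must already exist in node_to_id / edge_to_id):

// Two tensor pairs: {user, item} walking u2i-i2u and i2i, and {query} walking q2i-i2q.
// (Same wrapper instance as in the sketch above.)
wrapper->init_conf("[user;item,query]",
                   "[u2i-i2u;i2i,q2i-i2q]",
                   /*excluded_train_pair=*/"",
                   /*pair_label=*/"");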