diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5608b6f6f348b4..5d81e53a695bca 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -227,3 +227,6 @@ endif(WITH_CRYPTO) if(WITH_CUSTOM_DEVICE AND NOT WIN32) add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) endif() +if(WITH_GPU_GRAPH) + add_definitions(-DPADDLE_WITH_GPU_GRAPH) +endif() diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 8ff12265269b28..6a8bf9683bb3b2 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -143,10 +143,8 @@ int32_t GraphBrpcService::add_graph_node(Table *table, int idx_ = *(int *)(request.params(0).c_str()); size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 3) { size_t weight_list_size = request.params(2).size() / sizeof(bool); @@ -177,11 +175,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(idx_, node_ids); return 0; @@ -215,11 +211,6 @@ int32_t GraphBrpcService::Initialize() { &GraphBrpcService::graph_set_node_feat; _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = &GraphBrpcService::sample_neighbors_across_multi_servers; - // _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] = - // &GraphBrpcService::use_neighbors_sample_cache; - // _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] = - // &GraphBrpcService::load_graph_split_config; - // shard初始化,server启动后才可从env获取到server_list的shard信息 InitializeShardInfo(); return 0; @@ -384,9 +375,6 @@ int32_t GraphBrpcService::pull_graph_list(Table *table, int start = *(int *)(request.params(2).c_str()); int size = *(int *)(request.params(3).c_str()); int step = *(int *)(request.params(4).c_str()); - // int start = *(int *)(request.params(0).c_str()); - // int size = *(int *)(request.params(1).c_str()); - // int step = *(int *)(request.params(2).c_str()); std::unique_ptr buffer; int actual_size; ((GraphTable *)table) @@ -406,14 +394,10 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - int sample_size = *(int64_t *)(request.params(2).c_str()); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + int sample_size = *(int *)(request.params(2).c_str()); 
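// Illustrative sketch (not part of this patch): the hunks above consistently
// switch request-param decoding from int64_t to uint64_t node ids and read
// sample_size as a plain int instead of dereferencing it as int64_t. A
// minimal, self-contained helper showing that decoding convention; the
// helper name is hypothetical and it operates directly on the raw param
// bytes rather than on the brpc request object:
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Unpack a byte blob of packed uint64_t node ids (e.g. request.params(1)).
static std::vector<uint64_t> decode_node_ids(const std::string &param) {
  size_t node_num = param.size() / sizeof(uint64_t);
  std::vector<uint64_t> ids(node_num);
  // memcpy instead of a raw pointer cast avoids alignment pitfalls.
  std::memcpy(ids.data(), param.data(), node_num * sizeof(uint64_t));
  return ids;
}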
bool need_weight = *(bool *)(request.params(3).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - // int sample_size = *(int64_t *)(request.params(1).c_str()); - // bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); ((GraphTable *)table) @@ -433,7 +417,7 @@ int32_t GraphBrpcService::graph_random_sample_nodes( brpc::Controller *cntl) { int type_id = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(1).c_str()); - size_t size = *(int64_t *)(request.params(2).c_str()); + size_t size = *(uint64_t *)(request.params(2).c_str()); // size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; @@ -459,11 +443,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(2), "\t"); @@ -497,22 +479,15 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t), + size_t node_num = request.params(1).size() / sizeof(uint64_t), size_of_size_t = sizeof(size_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - int sample_size = *(int64_t *)(request.params(2).c_str()); - bool need_weight = *(int64_t *)(request.params(3).c_str()); - - // size_t node_num = request.params(0).size() / sizeof(int64_t), - // size_of_size_t = sizeof(size_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - // int sample_size = *(int64_t *)(request.params(1).c_str()); - // bool need_weight = *(int64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable - // *)table).filter_out_non_exist_nodes(node_data, sample_size); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + int sample_size = *(int *)(request.params(2).c_str()); + bool need_weight = *(bool *)(request.params(3).c_str()); + std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = GetRank(); for (int query_idx = 0; query_idx < node_num; ++query_idx) { @@ -535,7 +510,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -624,7 +599,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(int64_t) * node_num); + sizeof(uint64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, 
sizeof(int)); closure->request(request_idx) @@ -661,11 +636,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, } int idx_ = *(int *)(request.params(0).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); // std::vector feature_names = // paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index 55beb9b3932a62..892873d2294c49 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -81,14 +81,14 @@ class GraphPyService { graph_proto->set_table_name("cpu_graph_table"); graph_proto->set_use_cache(false); - for (int i = 0; i < id_to_edge.size(); i++) + for (int i = 0; i < (int)id_to_edge.size(); i++) graph_proto->add_edge_types(id_to_edge[i]); - for (int i = 0; i < id_to_feature.size(); i++) { + for (int i = 0; i < (int)id_to_feature.size(); i++) { graph_proto->add_node_types(id_to_feature[i]); auto feat_node = id_to_feature[i]; ::paddle::distributed::GraphFeature* g_f = graph_proto->add_graph_feature(); - for (int x = 0; x < table_feat_conf_feat_name[i].size(); x++) { + for (int x = 0; x < (int)table_feat_conf_feat_name[i].size(); x++) { g_f->add_name(table_feat_conf_feat_name[i][x]); g_f->add_dtype(table_feat_conf_feat_dtype[i][x]); g_f->add_shape(table_feat_conf_feat_shape[i][x]); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 43dee275a3dc69..2d48d3e4e6449f 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -44,15 +44,86 @@ int32_t GraphTable::Load_to_ssd(const std::string &path, return 0; } +paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( + std::vector &node_ids, int slot_num) { + std::vector> bags(task_pool_size_); + for (auto x : node_ids) { + int location = x % shard_num % task_pool_size_; + bags[location].push_back(x); + } + std::vector> tasks; + std::vector feature_array[task_pool_size_]; + std::vector slot_id_array[task_pool_size_]; + std::vector + node_fea_array[task_pool_size_]; + for (size_t i = 0; i < bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { + paddle::framework::GpuPsGraphFeaNode x; + std::vector feature_ids; + for (size_t j = 0; j < bags[i].size(); j++) { + // TODO use FEATURE_TABLE instead + Node *v = find_node(1, bags[i][j]); + x.node_id = bags[i][j]; + if (v == NULL) { + x.feature_size = 0; + x.feature_offset = 0; + node_fea_array[i].push_back(x); + } else { + // x <- v + x.feature_offset = feature_array[i].size(); + int total_feature_size = 0; + for (int k = 0; k < slot_num; ++k) { + v->get_feature_ids(k, &feature_ids); + total_feature_size += feature_ids.size(); + if (!feature_ids.empty()) { + feature_array[i].insert(feature_array[i].end(), + feature_ids.begin(), feature_ids.end()); + 
slot_id_array[i].insert(slot_id_array[i].end(), + feature_ids.size(), k); + } + } + x.feature_size = total_feature_size; + node_fea_array[i].push_back(x); + } + } + return 0; + })); + } + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + paddle::framework::GpuPsCommGraphFea res; + uint64_t tot_len = 0; + for (int i = 0; i < task_pool_size_; i++) { + tot_len += feature_array[i].size(); + } + VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len + << "] node_ids_size[" << node_ids.size() << "]"; + res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num); + unsigned int offset = 0, ind = 0; + for (int i = 0; i < task_pool_size_; i++) { + for (int j = 0; j < (int)node_fea_array[i].size(); j++) { + res.node_list[ind] = node_fea_array[i][j]; + res.node_list[ind++].feature_offset += offset; + } + for (size_t j = 0; j < feature_array[i].size(); j++) { + res.feature_list[offset + j] = feature_array[i][j]; + res.slot_id_list[offset + j] = slot_id_array[i][j]; + } + offset += feature_array[i].size(); + } + return res; +} + paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( - int idx, std::vector ids) { - std::vector> bags(task_pool_size_); + int idx, std::vector ids) { + std::vector> bags(task_pool_size_); for (auto x : ids) { int location = x % shard_num % task_pool_size_; bags[location].push_back(x); } std::vector> tasks; - std::vector edge_array[task_pool_size_]; + std::vector edge_array[task_pool_size_]; std::vector node_array[task_pool_size_]; for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { @@ -69,7 +140,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( x.neighbor_size = v->get_neighbor_size(); x.neighbor_offset = edge_array[i].size(); node_array[i].push_back(x); - for (size_t k = 0; k < x.neighbor_size; k++) { + for (size_t k = 0; k < (size_t)x.neighbor_size; k++) { edge_array[i].push_back(v->get_neighbor_id(k)); } } @@ -84,10 +155,6 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( for (int i = 0; i < task_pool_size_; i++) { tot_len += edge_array[i].size(); } - // res.neighbor_size = tot_len; - // res.node_size = ids.size(); - // res.neighbor_list = new int64_t[tot_len]; - // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; res.init_on_cpu(tot_len, ids.size()); int64_t offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { @@ -103,55 +170,34 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( return res; } -int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, +int32_t GraphTable::add_node_to_ssd(int type_id, int idx, uint64_t src_id, char *data, int len) { if (_db != NULL) { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memcpy(ch, &type_id, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); - memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &src_id, sizeof(uint64_t)); std::string str; if (_db->get(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), str) == 0) { - int64_t *stored_data = ((int64_t *)str.c_str()); - int n = str.size() / sizeof(int64_t); - char *new_data = new char[n * sizeof(int64_t) + len]; - memcpy(new_data, stored_data, n * sizeof(int64_t)); - memcpy(new_data + n * sizeof(int64_t), data, len); + sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { + uint64_t *stored_data = ((uint64_t *)str.c_str()); + int n = str.size() / sizeof(uint64_t); + char *new_data = new char[n * 
sizeof(uint64_t) + len]; + memcpy(new_data, stored_data, n * sizeof(uint64_t)); + memcpy(new_data + n * sizeof(uint64_t), data, len); _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)new_data, - n * sizeof(int64_t) + len); + sizeof(int) * 2 + sizeof(uint64_t), (char *)new_data, + n * sizeof(uint64_t) + len); delete[] new_data; } else { _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + sizeof(int) * 2 + sizeof(uint64_t), (char *)data, len); } - // _db->flush(src_id % shard_num % task_pool_size_); - // std::string x; - // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + - // 2 * sizeof(int), x) ==0){ - // VLOG(0)<<"put result"; - // for(int i = 0;i < x.size();i+=8){ - // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); - // } - //} - // if(src_id == 429){ - // str = ""; - // _db->get(src_id % shard_num % task_pool_size_, ch, - // sizeof(int) * 2 + sizeof(int64_t), str); - // int64_t *stored_data = ((int64_t *)str.c_str()); - // int n = str.size() / sizeof(int64_t); - // VLOG(0)<<"429 has "< rng, int &actual_size) { if (_db == NULL) { actual_size = 0; @@ -159,16 +205,16 @@ char *GraphTable::random_sample_neighbor_from_ssd( } std::string str; VLOG(2) << "sample ssd for key " << id; - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); - memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &id, sizeof(uint64_t)); if (_db->get(id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), str) == 0) { - int64_t *data = ((int64_t *)str.c_str()); - int n = str.size() / sizeof(int64_t); + sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { + uint64_t *data = ((uint64_t *)str.c_str()); + int n = str.size() / sizeof(uint64_t); std::unordered_map m; - // std::vector res; + // std::vector res; int sm_size = std::min(n, sample_size); actual_size = sm_size * Node::id_size; char *buff = new char[actual_size]; @@ -192,7 +238,7 @@ char *GraphTable::random_sample_neighbor_from_ssd( // res.push_back(data[pos]); } for (int i = 0; i < actual_size; i += 8) { - VLOG(2) << "sampled an neighbor " << *(int64_t *)&buff[i]; + VLOG(2) << "sampled an neighbor " << *(uint64_t *)&buff[i]; } return buff; } @@ -201,8 +247,8 @@ char *GraphTable::random_sample_neighbor_from_ssd( } int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, - std::vector &ids) { - std::vector> bags(task_pool_size_); + std::vector &ids) { + std::vector> bags(task_pool_size_); for (auto x : ids) { int location = x % shard_num % task_pool_size_; bags[location].push_back(x); @@ -213,17 +259,17 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); for (size_t k = 0; k < bags[i].size(); k++) { auto v = bags[i][k]; - memcpy(ch + sizeof(int) * 2, &v, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &v, sizeof(uint64_t)); std::string str; - if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (int j = 0; j < str.size(); j += sizeof(int64_t)) { - int64_t id = *(int64_t 
*)(str.c_str() + j); + for (int j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } } @@ -260,7 +306,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { std::vector weight_cost(part_len, 0); std::vector memory_remaining(part_len, gb_size_by_discount); std::vector score(part_len, 0); - std::unordered_map id_map; + std::unordered_map id_map; std::vector iters; for (int i = 0; i < task_pool_size_; i++) { iters.push_back(_db->get_iterator(i)); @@ -268,7 +314,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { } int next = 0; while (iters.size()) { - if (next >= iters.size()) { + if (next >= (int)iters.size()) { next = 0; } if (!iters[next]->Valid()) { @@ -284,7 +330,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { continue; } std::string value = iters[next]->value().ToString(); - std::int64_t i_key = *(int64_t *)(key.c_str() + sizeof(int) * 2); + std::uint64_t i_key = *(uint64_t *)(key.c_str() + sizeof(int) * 2); for (int i = 0; i < part_len; i++) { if (memory_remaining[i] < (int64_t)value.size()) { score[i] = -100000.0; @@ -292,8 +338,8 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (int j = 0; j < value.size(); j += sizeof(int64_t)) { - int64_t v = *((int64_t *)(value.c_str() + j)); + for (int j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { index = id_map[v]; @@ -413,8 +459,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; - bool is_weighted = false; - int valid_count = 0; for (auto path : paths) { std::ifstream file(path); std::string line; @@ -425,13 +469,13 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, if (values.size() < 2) continue; auto src_id = std::stoll(values[0]); auto dist_ids = paddle::string::split_string(values[1], ";"); - std::vector dist_data; + std::vector dist_data; for (auto x : dist_ids) { dist_data.push_back(std::stoll(x)); - total_memory_cost += sizeof(int64_t); + total_memory_cost += sizeof(uint64_t); } add_node_to_ssd(0, idx, src_id, (char *)dist_data.data(), - (int)(dist_data.size() * sizeof(int64_t))); + (int)(dist_data.size() * sizeof(uint64_t))); } } VLOG(0) << "total memory cost = " << total_memory_cost << " bytes"; @@ -440,9 +484,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, int32_t GraphTable::dump_edges_to_ssd(int idx) { VLOG(2) << "calling dump edges to ssd"; - const int64_t fixed_size = 10000; - // std::vector edge_array[task_pool_size_]; - std::vector> count(task_pool_size_); std::vector> tasks; auto &shards = edge_shards[idx]; for (size_t i = 0; i < shards.size(); ++i) { @@ -450,15 +491,14 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { [&, i, this]() -> int64_t { int64_t cost = 0; std::vector &v = shards[i]->get_bucket(); - size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { - std::vector s; - for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + std::vector s; + for (int k = 0; k < (int)v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } - cost += v[j]->get_neighbor_size() * sizeof(int64_t); + cost += v[j]->get_neighbor_size() * sizeof(uint64_t); add_node_to_ssd(0, idx, v[j]->get_id(), (char 
*)s.data(), - s.size() * sizeof(int64_t)); + s.size() * sizeof(uint64_t)); } return cost; })); @@ -470,7 +510,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { VLOG(0) << "make_complementary_graph"; const int64_t fixed_size = byte_size / 8; // std::vector edge_array[task_pool_size_]; - std::vector> count(task_pool_size_); + std::vector> count(task_pool_size_); std::vector> tasks; auto &shards = edge_shards[idx]; for (size_t i = 0; i < shards.size(); ++i) { @@ -480,7 +520,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { // size_t location = v[j]->get_id(); - for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { count[ind][v[j]->get_neighbor_id(k)]++; } } @@ -488,9 +528,9 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { })); } for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); - std::unordered_map final_count; - std::map> count_to_id; - std::vector buffer; + std::unordered_map final_count; + std::map> count_to_id; + std::vector buffer; clear_graph(idx); for (int i = 0; i < task_pool_size_; i++) { @@ -527,6 +567,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { bucket[i]->build_sampler(sample_type); } } + return 0; } #endif @@ -821,7 +862,7 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { +int32_t GraphTable::add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id) { size_t src_shard_id = src_id % shard_num; if (src_shard_id >= shard_end || src_shard_id < shard_start) { @@ -832,11 +873,11 @@ int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { edge_shards[idx][index]->add_neighbor(src_id, dst_id, 1.0); return 0; } -int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, +int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, std::vector &is_weight_list) { auto &shards = edge_shards[idx]; size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -861,9 +902,9 @@ int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(int idx, std::vector &id_list) { +int32_t GraphTable::remove_graph_node(int idx, std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -896,7 +937,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(int64_t id) { +void GraphShard::delete_node(uint64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -908,7 +949,7 @@ void GraphShard::delete_node(int64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(int64_t id) { +GraphNode *GraphShard::add_graph_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ 
-924,19 +965,25 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(int64_t id) { + +FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); + return (FeatureNode *)bucket[node_location[id]]; } - return (FeatureNode *)bucket[node_location[id]]; + if (is_overlap) { + return (FeatureNode *)bucket[node_location[id]]; + } + + return NULL; } -void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { +void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(int64_t id) { +Node *GraphShard::find_node(uint64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? nullptr : bucket[iter->second]; } @@ -974,11 +1021,11 @@ int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { int32_t GraphTable::get_nodes_ids_by_ranges( int type_id, int idx, std::vector> ranges, - std::vector &res) { + std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; @@ -994,7 +1041,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&shards, this, first, second, i]() -> std::vector { + [&shards, this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -1011,10 +1058,11 @@ int32_t GraphTable::get_nodes_ids_by_ranges( return 0; } +// TODO opt load all node_types in once reading int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { auto paths = paddle::string::split_string(path, ";"); - int64_t count = 0; - int64_t valid_count = 0; + uint64_t count = 0; + uint64_t valid_count = 0; int idx = 0; if (node_type == "") { VLOG(0) << "node_type not specified, loading edges to " << id_to_feature[0] @@ -1027,53 +1075,47 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } idx = feature_to_id[node_type]; } - for (auto path : paths) { - std::ifstream file(path); - std::string line; - while (std::getline(file, line)) { - auto values = paddle::string::split_string(line, "\t"); - if (values.size() < 2) continue; - auto id = std::stoull(values[1]); - - size_t shard_id = id % shard_num; - if (shard_id >= shard_end || shard_id < shard_start) { - VLOG(4) << "will not load " << id << " from " << path + + VLOG(0) << "Begin GraphTable::load_nodes() node_type[" << node_type << "]"; + std::vector> tasks; + for (size_t i = 0; i < paths.size(); i++) { + tasks.push_back(load_node_edge_task_pool[i % load_thread_num]->enqueue( + [&, i, idx, this]() -> int { + VLOG(0) << "Begin GraphTable::load_nodes(), path[" << paths[i] << "]"; + std::ifstream file(paths[i]); + std::string line; + uint64_t local_count = 0; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + if (values[0] != node_type) { + continue; + } + + auto id = std::stoull(values[1]); + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || 
shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path << ", please check id distribution"; - continue; - } - - if (count % 1000000 == 0) { - VLOG(0) << count << " nodes are loaded from filepath"; - VLOG(0) << line; - } - count++; - - std::string nt = values[0]; - if (nt != node_type) { - continue; - } - - size_t index = shard_id - shard_start; - - // auto node = shards[index]->add_feature_node(id); - auto node = feature_shards[idx][index]->add_feature_node(id); - node->set_feature_size(feat_name[idx].size()); + continue; + } - for (size_t slice = 2; slice < values.size(); slice++) { - auto feat = this->parse_feature(idx, values[slice]); - if (feat.first >= 0) { - node->set_feature(feat.first, feat.second); - } else { - VLOG(4) << "Node feature: " << values[slice] - << " not in feature_map."; + size_t index = shard_id - shard_start; + auto node = feature_shards[idx][index]->add_feature_node(id, false); + if (node != NULL) { + node->set_feature_size(feat_name[idx].size()); + for (size_t slice = 2; slice < values.size(); slice++) { + parse_feature(idx, values[slice], node); + } + local_count++; } } - valid_count++; - } + VLOG(0) << "node_type[" << node_type << "] loads " << local_count << " nodes from filepath->" << paths[i]; + return 0; + })); } - - VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type - << " are loaded successfully in " << path; + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + VLOG(0) << "successfully load all node_type[" << node_type << "] data"; return 0; } @@ -1089,9 +1131,8 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); if (search_level == 2) total_memory_cost = 0; - const int64_t fixed_load_edges = 1000000; + const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1107,104 +1148,125 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, } auto paths = paddle::string::split_string(path, ";"); - int64_t count = 0; + uint64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; - int valid_count = 0; - for (auto path : paths) { - std::ifstream file(path); - std::string line; - while (std::getline(file, line)) { - auto values = paddle::string::split_string(line, "\t"); - count++; - if (values.size() < 2) continue; - auto src_id = std::stoull(values[0]); - auto dst_id = std::stoull(values[1]); - if (reverse_edge) { - std::swap(src_id, dst_id); - } - float weight = 1; - if (values.size() == 3) { - weight = std::stof(values[2]); - sample_type = "weighted"; - is_weighted = true; - } - - size_t src_shard_id = src_id % shard_num; + + VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; + std::vector> tasks; + for (int i = 0; i < paths.size(); i++) { + tasks.push_back(load_node_edge_task_pool[i % load_thread_num]->enqueue( + [&, i, idx, this]() -> int { + uint64_t local_count = 0; + std::ifstream file(paths[i]); + std::string line; + auto path_split = paddle::string::split_string(paths[i], "/"); + auto part_name_split = paddle::string::split_string(path_split[path_split.size() - 1], "-"); + auto part_num = std::stoull(part_name_split[part_name_split.size() - 1]); + + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + local_count++; + if (values.size() < 2) 
continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + size_t src_shard_id = src_id % shard_num; + if (src_shard_id != (part_num % shard_num)) { + continue; + } + + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } - if (src_shard_id >= shard_end || src_shard_id < shard_start) { - VLOG(4) << "will not load " << src_id << " from " << path + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path << ", please check id distribution"; - continue; - } - - if (count % 1000000 == 0) { - VLOG(0) << count << " edges are loaded from filepath"; - VLOG(0) << line; + continue; + } + + size_t index = src_shard_id - shard_start; + edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); + edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); } - - size_t index = src_shard_id - shard_start; - edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); - edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); - valid_count++; #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - if (count > fixed_load_edges && search_level == 2) { + if (search_level == 2) { dump_edges_to_ssd(idx); VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; clear_graph(idx); - count = 0; } #endif - } + VLOG(0) << local_count << " edges are loaded from filepath->" << paths[i]; + return 0; + })); } - VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " - << path; + for (int j = 0; j < (int)tasks.size(); j++) tasks[j].get(); + VLOG(0) << "successfully load all edge_type[" << edge_type << "] data"; -// Build Sampler j -#ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - if (search_level == 2) { - if (count > 0) { - dump_edges_to_ssd(idx); - VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; - clear_graph(idx); - count = 0; - } - return 0; - } -#endif +#ifdef PADDLE_WITH_GPU_GRAPH + // To reduce memory overhead, CPU samplers won't be created in gpugraph. + // In order not to affect the sampler function of other scenario, + // this optimization is only performed in load_edges function. + VLOG(0) << "run in gpugraph mode!"; +#else + VLOG(0) << "build sampler ... "; for (auto &shard : edge_shards[idx]) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); } } - +#endif return 0; } + +Node *GraphTable::find_node(int type_id, uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + Node *node = nullptr; + size_t index = shard_id - shard_start; + auto &search_shards = type_id == 0 ? edge_shards : feature_shards; + for (auto& search_shard: search_shards) { + PADDLE_ENFORCE_NOT_NULL(search_shard[index]); + node = search_shard[index]->find_node(id); + if (node != nullptr) { + break; + } + } + return node; +} -Node *GraphTable::find_node(int type_id, int idx, int64_t id) { +Node *GraphTable::find_node(int type_id, int idx, uint64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { return nullptr; } size_t index = shard_id - shard_start; auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; + PADDLE_ENFORCE_NOT_NULL(search_shards[index]); Node *node = search_shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return node_id % shard_num % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index( + uint64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } int32_t GraphTable::clear_nodes(int type_id, int idx) { auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { search_shards[i]->clear(); } return 0; @@ -1268,16 +1330,16 @@ int32_t GraphTable::random_sample_nodes(int type_id, int idx, int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(type_id, idx, second_half, res); - actual_size = res.size() * sizeof(int64_t); + actual_size = res.size() * sizeof(uint64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - int idx, int64_t *node_ids, int sample_size, + int idx, uint64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -1295,7 +1357,7 @@ int32_t GraphTable::random_sample_neighbors( for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - int64_t node_id; + uint64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -1341,7 +1403,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - int64_t id; + uint64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -1376,13 +1438,14 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(int idx, const std::vector &node_ids, +int32_t GraphTable::get_node_feat(int idx, + const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idy = 0; idy < node_num; ++idy) { - int64_t node_id = node_ids[idy]; + uint64_t node_id = node_ids[idy]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, idy, node_id]() -> int { Node *node = find_node(1, idx, node_id); @@ -1410,13 +1473,13 @@ int32_t GraphTable::get_node_feat(int idx, const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - int idx, const std::vector &node_ids, + int idx, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idy = 0; idy < node_num; ++idy) { - int64_t node_id = node_ids[idy]; + uint64_t node_id = node_ids[idy]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, idy, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; @@ -1439,59 +1502,126 @@ int32_t GraphTable::set_node_feat( return 0; } -std::pair GraphTable::parse_feature( - int idx, std::string feat_str) { +void string_vector_2_string(std::vector::iterator strs_begin, + std::vector::iterator strs_end, char delim, std::string* output) { + size_t i = 0; + for (std::vector::iterator iter = strs_begin; iter != strs_end; ++iter) { + if (i > 0) { + *output += delim; + } + + *output += *iter; + ++i; + } +} + +int GraphTable::parse_feature(int idx, const std::string& feat_str, + FeatureNode* node) { // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, // "") - auto fields = paddle::string::split_string(feat_str, " "); - if (feat_id_map[idx].count(fields[0])) { - // if (this->feat_id_map.count(fields[0])) { - int32_t id = this->feat_id_map[idx][fields[0]]; + std::vector fields = + paddle::string::split_string(feat_str, feature_separator_); + auto it = feat_id_map[idx].find(fields[0]); + if (it != feat_id_map[idx].end()) { + int32_t id = it->second; + std::string* fea_ptr = node->mutable_feature(id); std::string dtype = this->feat_dtype[idx][id]; - std::vector values(fields.begin() + 1, fields.end()); if (dtype == "feasign") { - return std::make_pair( - int32_t(id), paddle::string::join_strings(values, ' ')); + string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); + return 0; } else if (dtype == "string") { - return std::make_pair( - int32_t(id), paddle::string::join_strings(values, ' ')); + string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); + return 0; } else if (dtype == "float32") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "float64") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "int32") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + 
FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "int64") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } + } else { + VLOG(2) << "feature_name[" << fields[0] + << "] is not in feat_id_map, ntype_id[" << idx + << "] feat_id_map_size[" << feat_id_map.size() << "]"; } - return std::make_pair(-1, ""); + + return -1; +} + +std::vector> GraphTable::get_all_id(int type_id, int slice_num) { + std::vector> res(slice_num); + auto &search_shards = type_id == 0 ? edge_shards : feature_shards; + std::vector>> tasks; + for (int idx = 0; idx < search_shards.size(); idx++) { + for (int j = 0; j < search_shards[idx].size(); j++) { + tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( + [&search_shards, idx, j]() -> std::vector { + return search_shards[idx][j]->get_all_id(); + })); + } + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) { + res[(uint64_t)(id) % slice_num].push_back(id); + } + } + return res; } -std::vector> GraphTable::get_all_id(int type_id, int idx, - int slice_num) { - std::vector> res(slice_num); +std::vector> GraphTable::get_all_id(int type_id, int idx, + int slice_num) { + std::vector> res(slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector>> tasks; + VLOG(0) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; for (int i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&search_shards, i]() -> std::vector { + [&search_shards, i]() -> std::vector { return search_shards[i]->get_all_id(); })); } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } + VLOG(0) << "end task, task_pool_size_[" << task_pool_size_ << "]"; for (size_t i = 0; i < tasks.size(); i++) { auto ids = tasks[i].get(); - for (auto &id : ids) res[(uint64_t)(id) % slice_num].push_back(id); + for (auto &id : ids) res[id % slice_num].push_back(id); } return res; } + +int GraphTable::get_all_feature_ids(int type_id, int idx, int slice_num, + std::vector>* output) { + output->resize(slice_num); + auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; + std::vector>> tasks; + for (int i = 0; i < search_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&search_shards, i]() -> std::vector { + return search_shards[i]->get_all_feature_ids(); + })); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) (*output)[id % slice_num].push_back(id); + } + return 0; +} + int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, int total_size, std::unique_ptr &buffer, @@ -1542,7 +1672,11 @@ int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, return 0; } -int32_t GraphTable::get_server_index_by_id(int64_t id) { +void GraphTable::set_feature_separator(const std::string &ch) { + feature_separator_ = ch; +} + +int32_t GraphTable::get_server_index_by_id(uint64_t id) { return id % shard_num / shard_num_per_server; } int32_t GraphTable::Initialize(const TableParameter &config, @@ -1617,6 +1751,10 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } + load_node_edge_task_pool.resize(load_thread_num); + for (size_t i = 0; i< load_node_edge_task_pool.size(); i++) { + load_node_edge_task_pool[i].reset(new ::ThreadPool(1)); + } auto graph_feature = graph.graph_feature(); auto node_types = graph.node_types(); auto edge_types = graph.edge_types(); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 25bec5276e7293..5692bbfd3b8dd2 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -56,33 +56,44 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + std::vector get_ids_by_range(int start, int end) { + std::vector res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - std::vector get_all_id() { - std::vector res; + std::vector get_all_id() { + std::vector res; for (int i = 0; i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(int64_t id); + std::vector get_all_feature_ids() { + // TODO by huwei02, dedup + std::vector total_res; + std::set res; + for (int i = 0; i < (int)bucket.size(); i++) { + res.clear(); + bucket[i]->get_feature_ids(&res); + total_res.insert(total_res.end(), res.begin(), res.end()); + } + return total_res; + } + GraphNode *add_graph_node(uint64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(int64_t id); - Node *find_node(int64_t id); - void delete_node(int64_t id); + FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true); + Node *find_node(uint64_t id); + void delete_node(uint64_t id); void clear(); - void add_neighbor(int64_t id, int64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(uint64_t id, uint64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; @@ -90,10 +101,10 @@ enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct 
SampleKey { int idx; - int64_t node_key; + uint64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(int _idx, int64_t _node_key, size_t _sample_size, + SampleKey(int _idx, uint64_t _node_key, size_t _sample_size, bool _is_weighted) { idx = _idx; node_key = _node_key; @@ -455,7 +466,7 @@ class GraphTable : public Table { int step); virtual int32_t random_sample_neighbors( - int idx, int64_t *node_ids, int sample_size, + int idx, uint64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight); @@ -465,7 +476,7 @@ class GraphTable : public Table { virtual int32_t get_nodes_ids_by_ranges( int type_id, int idx, std::vector> ranges, - std::vector &res); + std::vector &res); virtual int32_t Initialize() { return 0; } virtual int32_t Initialize(const TableParameter &config, const FsClientParameter &fs_config); @@ -475,17 +486,21 @@ class GraphTable : public Table { int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - std::vector> get_all_id(int type, int idx, - int slice_num); + std::vector> get_all_id(int type, int slice_num); + std::vector> get_all_id(int type, int idx, + int slice_num); + int get_all_feature_ids(int type, int idx, + int slice_num, std::vector>* output); int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(int idx, std::vector &id_list, + int32_t add_graph_node(int idx, std::vector &id_list, std::vector &is_weight_list); - int32_t remove_graph_node(int idx, std::vector &id_list); + int32_t remove_graph_node(int idx, std::vector &id_list); - int32_t get_server_index_by_id(int64_t id); - Node *find_node(int type_id, int idx, int64_t id); + int32_t get_server_index_by_id(uint64_t id); + Node *find_node(int type_id, int idx, uint64_t id); + Node *find_node(int type_id, uint64_t id); virtual int32_t Pull(TableContext &context) { return 0; } virtual int32_t Push(TableContext &context) { return 0; } @@ -510,17 +525,17 @@ class GraphTable : public Table { this->server_num = server_num; return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); - virtual uint32_t get_thread_pool_index(int64_t node_id); - virtual std::pair parse_feature(int idx, - std::string feat_str); + virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int parse_feature(int idx, const std::string& feat_str, + FeatureNode* node); - virtual int32_t get_node_feat(int idx, const std::vector &node_ids, + virtual int32_t get_node_feat(int idx, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - int idx, const std::vector &node_ids, + int idx, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -554,20 +569,22 @@ class GraphTable : public Table { virtual void make_partitions(int idx, int64_t gb_size, int device_len); virtual void export_partition_files(int idx, std::string file_path); virtual char *random_sample_neighbor_from_ssd( - int idx, int64_t id, int sample_size, + int idx, uint64_t id, int sample_size, const std::shared_ptr rng, int &actual_size); - virtual int32_t add_node_to_ssd(int type_id, int idx, int64_t src_id, + virtual int32_t add_node_to_ssd(int type_id, int idx, uint64_t src_id, char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( - int idx, std::vector ids); + int idx, std::vector ids); + virtual 
paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( + std::vector &node_ids, int slot_num); int32_t Load_to_ssd(const std::string &path, const std::string ¶m); - int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); + int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); int32_t make_complementary_graph(int idx, int64_t byte_size); int32_t dump_edges_to_ssd(int idx); int32_t get_partition_num(int idx) { return partitions[idx].size(); } - std::vector get_partition(int idx, int index) { - if (idx >= partitions.size() || index >= partitions[idx].size()) - return std::vector(); + std::vector get_partition(int idx, int index) { + if (idx >= (int)partitions.size() || index >= (int)partitions[idx].size()) + return std::vector(); return partitions[idx][index]; } int32_t load_edges_to_ssd(const std::string &path, bool reverse_edge, @@ -576,17 +593,19 @@ class GraphTable : public Table { void set_search_level(int search_level) { this->search_level = search_level; } int search_level; int64_t total_memory_cost; - std::vector>> partitions; + std::vector>> partitions; int next_partition; #endif - virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id); + virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); + void set_feature_separator(const std::string &ch); std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; int task_pool_size_ = 24; + int load_thread_num = 150; const int random_sample_nodes_ranges = 3; - std::vector>> node_weight; + std::vector>> node_weight; std::vector> feat_name; std::vector> feat_dtype; std::vector> feat_shape; @@ -598,9 +617,10 @@ class GraphTable : public Table { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; + std::vector> load_node_edge_task_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; int cache_size_limit; int cache_ttl; @@ -609,9 +629,10 @@ class GraphTable : public Table { #ifdef PADDLE_WITH_HETERPS // paddle::framework::GpuPsGraphTable gpu_graph_table; paddle::distributed::RocksDBHandler *_db; -// std::shared_ptr<::ThreadPool> graph_sample_pool; -// std::shared_ptr graph_sampler; -// REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) + // std::shared_ptr<::ThreadPool> graph_sample_pool; + // std::shared_ptr graph_sampler; + // REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) + std::string feature_separator_ = std::string(" "); #endif }; @@ -630,7 +651,7 @@ class CompleteGraphSampler : public GraphSampler { protected: GraphTable *graph_table; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; // std::vector sample_res; // std::shared_ptr random; int gpu_num; @@ -649,11 +670,11 @@ class BasicBfsGraphSampler : public GraphSampler { GraphTable *graph_table; // std::vector> sample_nodes; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; size_t gpu_num; int init_search_size, node_num_for_each_shard, edge_num_for_each_node; int rounds, interval; - std::vector>> + std::vector>> sample_neighbors_map; }; #endif diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h 
index c6c594036d4fc9..5f567d0c4b4931 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,7 +18,12 @@ #include #include #include +#include +#include "glog/logging.h" #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/enforce.h" + namespace paddle { namespace distributed { @@ -29,6 +34,7 @@ class Node { virtual ~Node() {} static int id_size, int_size, weight_size; uint64_t get_id() { return id; } + int64_t get_py_id() { return (int64_t)id; } void set_id(uint64_t id) { this->id = id; } virtual void build_edges(bool is_weighted) {} @@ -45,7 +51,13 @@ class Node { virtual void to_buffer(char *buffer, bool need_feature); virtual void recover_from_buffer(char *buffer); virtual std::string get_feature(int idx) { return std::string(""); } - virtual void set_feature(int idx, std::string str) {} + virtual int get_feature_ids(std::set *res) const { + return 0; + } + virtual int get_feature_ids(int slot_idx, std::vector *res) const { + return 0; + } + virtual void set_feature(int idx, const std::string& str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } virtual size_t get_neighbor_size() { return 0; } @@ -94,7 +106,51 @@ class FeatureNode : public Node { } } - virtual void set_feature(int idx, std::string str) { + virtual int get_feature_ids(std::set *res) const { + PADDLE_ENFORCE_NOT_NULL(res); + errno = 0; + for (auto& feature_item: feature) { + const char *feat_str = feature_item.c_str(); + auto fields = paddle::string::split_string(feat_str, " "); + char *head_ptr = NULL; + for (auto &field : fields) { + PADDLE_ENFORCE_EQ(field.empty(), false); + uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); + PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + res->insert(feasign); + } + } + PADDLE_ENFORCE_EQ(errno, 0); + return 0; + } + + virtual int get_feature_ids(int slot_idx, std::vector *res) const { + PADDLE_ENFORCE_NOT_NULL(res); + res->clear(); + errno = 0; + if (slot_idx < (int)this->feature.size()) { + const char *feat_str = this->feature[slot_idx].c_str(); + auto fields = paddle::string::split_string(feat_str, " "); + char *head_ptr = NULL; + for (auto &field : fields) { + PADDLE_ENFORCE_EQ(field.empty(), false); + uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); + PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + res->push_back(feasign); + } + } + PADDLE_ENFORCE_EQ(errno, 0); + return 0; + } + + virtual std::string* mutable_feature(int idx) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + return &(this->feature[idx]); + } + + virtual void set_feature(int idx, const std::string& str) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); } @@ -116,6 +172,22 @@ class FeatureNode : public Node { return std::string(buffer, Tsize); } + template + static void parse_value_to_bytes(std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, + std::string* output) { + T v; + size_t feat_str_size = feat_str_end - feat_str_begin; + size_t Tsize = sizeof(T) * feat_str_size; + char buffer[Tsize] = {'\0'}; + for (size_t i = 0; i < feat_str_size; i++) { + std::stringstream ss(*(feat_str_begin + i)); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + output->assign(buffer); + } + template static std::vector parse_bytes_to_array(std::string feat_str) { T v; 
@@ -130,8 +202,9 @@ class FeatureNode : public Node { return out; } - protected: +protected: std::vector feature; }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bb7f3f26463d49..e4f8b76f0dff26 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -321,7 +321,9 @@ if(WITH_DISTRIBUTE) device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog index_sampler index_wrapper sampler index_dataset_proto lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor) + graph_to_program_pass variable_helper timer monitor + heter_service_proto fleet heter_server brpc fleet_executor + graph_gpu_wrapper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index b63f317aae8932..7962e1591f0faf 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2065,6 +2065,7 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } else { so_parser_name_.clear(); } + gpu_graph_data_generator_.SetConfig(data_feed_desc); } void SlotRecordInMemoryDataFeed::LoadIntoMemory() { @@ -2589,6 +2590,7 @@ bool SlotRecordInMemoryDataFeed::Start() { #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) CHECK(paddle::platform::is_gpu_place(this->place_)); pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); + gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); #endif return true; } @@ -2596,27 +2598,31 @@ bool SlotRecordInMemoryDataFeed::Start() { int SlotRecordInMemoryDataFeed::Next() { #ifdef _LINUX this->CheckStart(); - - VLOG(3) << "enable heter next: " << offset_index_ - << " batch_offsets: " << batch_offsets_.size(); - if (offset_index_ >= batch_offsets_.size()) { - VLOG(3) << "offset_index: " << offset_index_ + if (!gpu_graph_mode_) { + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); - return 0; - } - auto& batch = batch_offsets_[offset_index_++]; - this->batch_size_ = batch.second; - VLOG(3) << "batch_size_=" << this->batch_size_ - << ", thread_id=" << thread_id_; - if (this->batch_size_ != 0) { - PutToFeedVec(&records_[batch.first], this->batch_size_); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; } else { - VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" - << thread_id_; + VLOG(3) << "datafeed in gpu graph mode"; + this->batch_size_ = 
gpu_graph_data_generator_.GenerateBatch(); } - VLOG(3) << "enable heter next: " << offset_index_ - << " batch_offsets: " << batch_offsets_.size() - << " baych_size: " << this->batch_size_; return this->batch_size_; #else diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index f9435ec2a32d84..1814fa44da62c4 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -17,7 +17,14 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) +#include +#include +#include +#include +#include "cub/cub.cuh" #include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" namespace paddle { namespace framework { @@ -144,6 +151,714 @@ void SlotRecordInMemoryDataFeed::CopyForTensor( cudaStreamSynchronize(stream); } +__global__ void GraphFillCVMKernel(int64_t *tensor, int len) { + CUDA_KERNEL_LOOP(idx, len) { tensor[idx] = 1; } +} + +int GraphDataGenerator::AcquireInstance(BufState *state) { + // + if (state->GetNextStep()) { + state->Debug(); + return state->len; + } else if (state->GetNextCentrolWord()) { + state->Debug(); + return state->len; + } else if (state->GetNextBatch()) { + state->Debug(); + return state->len; + } + return 0; +} + +// TODO opt +__global__ void GraphFillFeatureKernel(int64_t *id_tensor, int *fill_ins_num, + int64_t *walk, int64_t *feature, int *row, int central_word, + int step, int len, int col_num, int slot_num) { + __shared__ int64_t local_key[CUDA_NUM_THREADS * 2]; + __shared__ int local_num; + __shared__ int global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + if (idx < len) { + int src = row[idx] * col_num + central_word; + if (walk[src] != 0 && walk[src + step] != 0) { + size_t dst = atomicAdd(&local_num, 1); + for (int i = 0; i < slot_num; ++i) { + local_key[dst * 2 * slot_num + i * 2] = feature[src * slot_num + i]; + local_key[dst * 2 * slot_num + i * 2 + 1] = feature[(src + step) * slot_num + i]; + } + } + } + + if (threadIdx.x == 0) { + global_num = atomicAdd(fill_ins_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + for (int i = 0; i < slot_num; ++i) { + id_tensor[(global_num * 2 + 2 * threadIdx.x) * slot_num + i] + = local_key[(2 * threadIdx.x) * slot_num + i]; + id_tensor[(global_num * 2 + 2 * threadIdx.x + 1) * slot_num + i] = + local_key[(2 * threadIdx.x + 1) * slot_num + i]; + } + } +} + +__global__ void GraphFillIdKernel(int64_t *id_tensor, int *fill_ins_num, + int64_t *walk, int *row, int central_word, + int step, int len, int col_num) { + __shared__ int64_t local_key[CUDA_NUM_THREADS * 2]; + __shared__ int local_num; + __shared__ int global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + // int dst = idx * 2; + // id_tensor[dst] = walk[src]; + // id_tensor[dst + 1] = walk[src + step]; + if (idx < len) { + int src = row[idx] * col_num + central_word; + if (walk[src] != 0 && walk[src + step] != 0) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst * 2] = walk[src]; + local_key[dst * 2 + 1] = walk[src + step]; + } + } + + if (threadIdx.x == 0) { + global_num = atomicAdd(fill_ins_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + id_tensor[global_num * 2 + 2 * threadIdx.x] = local_key[2 * threadIdx.x]; + 
id_tensor[global_num * 2 + 2 * threadIdx.x + 1] = + local_key[2 * threadIdx.x + 1]; + } +} + +__global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { + CUDA_KERNEL_LOOP(idx, len) { + id_tensor[idx] = idx; + } +} + +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; + } + int total_instance = AcquireInstance(&buf_state_); + + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + + if (total_instance == 0) { + int res = FillWalkBuf(d_walk_); + if (!res) { + // graph iterate complete + return -1; + } else { + total_instance = buf_state_.len; + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + // if (total_instance == 0) { + // return -1; + //} + } + + if (slot_num_ > 0) { + FillFeatureBuf(d_walk_, d_feature_); + if (debug_mode_) { + int len = buf_size_ > 5000? 5000: buf_size_; + uint64_t h_walk[len]; + cudaMemcpy(h_walk, d_walk_->ptr(), len * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[len * slot_num_]; + cudaMemcpy(h_feature, d_feature_->ptr(), len * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for(int i = 0; i < len; ++i) { + std::stringstream ss; + for (int j = 0; j < slot_num_; ++j) { + ss << h_feature[i * slot_num_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i << "] = " << (uint64_t)h_walk[i] + << " feature[" << i * slot_num_ << ".." << (i + 1) * slot_num_ << "] = " << ss.str(); + } + } + } + } + + int64_t *walk = reinterpret_cast(d_walk_->ptr()); + int64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + int *random_row = reinterpret_cast(d_random_row_->ptr()); + int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + int len = buf_state_.len; + GraphFillIdKernel<<>>( + ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, + random_row + buf_state_.cursor, buf_state_.central_word, + window_step_[buf_state_.step], len, walk_len_); + int h_pair_num; + cudaMemcpyAsync(&h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, + stream_); + + int64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + if (slot_num_ > 0) { + int64_t *feature = reinterpret_cast(d_feature_->ptr()); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + int len = buf_state_.len; + VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ << "] len[" << len << "]"; + GraphFillFeatureKernel<<>>( + feature_buf + ins_buf_pair_len_ * 2 * slot_num_, d_pair_num, walk, feature, + random_row + buf_state_.cursor, buf_state_.central_word, + window_step_[buf_state_.step], len, walk_len_, slot_num_); + } + + cudaStreamSynchronize(stream_); + ins_buf_pair_len_ += h_pair_num; + + if (debug_mode_) { + int64_t *h_ins_buf = new int64_t[ins_buf_pair_len_ * 2]; + cudaMemcpy(h_ins_buf, ins_buf, 2 * ins_buf_pair_len_ * sizeof(int64_t), + cudaMemcpyDeviceToHost); + VLOG(2) << "h_pair_num = " << h_pair_num + << ", ins_buf_pair_len = " << ins_buf_pair_len_; + for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { + VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; + } + delete[] h_ins_buf; + + int64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; + cudaMemcpy(h_feature_buf, feature_buf, (batch_size_ * 2 * 2) * slot_num_ * sizeof(int64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { + VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; + } + } + return ins_buf_pair_len_; +} + + +int GraphDataGenerator::GenerateBatch() 
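+// GenerateBatch drains up to batch_size_ (center, context) pairs from the
+// instance buffer, calling FillInsBuf to refill it as needed; it then copies
+// the 2 * pair node ids into feed_vec_[0], fills the show/clk tensors with 1s
+// via GraphFillCVMKernel, scatters per-slot feature ids together with their
+// lod tensors, and returns 1 (or 0 once the graph walk data is exhausted).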
{ + platform::CUDADeviceGuard guard(gpuid_); + int res = 0; + while (ins_buf_pair_len_ < batch_size_) { + res = FillInsBuf(); + if (res == -1) { + if (ins_buf_pair_len_ == 0) { + return 0; + } else { + break; + } + } + } + int total_instance = + ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_; + + total_instance *= 2; + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + + int64_t* slot_tensor_ptr_[slot_num_]; + int64_t* slot_lod_tensor_ptr_[slot_num_]; + if (slot_num_ > 0) { + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = + feed_vec_[3 + 2 * i]->mutable_data({total_instance, 1}, this->place_); + slot_lod_tensor_ptr_[i] = + feed_vec_[3 + 2 * i + 1]->mutable_data({total_instance + 1}, this->place_); + } + } + + VLOG(2) << "total_instance: " << total_instance + << ", ins_buf_pair_len = " << ins_buf_pair_len_; + int64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + int64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, ins_cursor, sizeof(int64_t) * total_instance, + cudaMemcpyDeviceToDevice, stream_); + + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + + if (slot_num_ > 0) { + int64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + for (int i = 0; i < slot_num_; ++i) { + int feature_buf_offset = (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; + // TODO huwei02 opt + for (int j = 0; j < total_instance; j += 2) { + VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" + << feature_buf_offset + j * slot_num_ << "]"; + VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" + << feature_buf_offset + j * slot_num_ + 1 << "]"; + cudaMemcpyAsync(slot_tensor_ptr_[i] + j, &feature_buf[feature_buf_offset + j * slot_num_], + sizeof(int64_t) * 2, cudaMemcpyDeviceToDevice, stream_); + } + GraphFillSlotLodKernel<<>>( + slot_lod_tensor_ptr_[i], total_instance + 1); + } + } + + offset_.clear(); + offset_.push_back(0); + offset_.push_back(total_instance); + LoD lod{offset_}; + feed_vec_[0]->set_lod(lod); + if (slot_num_ > 0) { + for (int i = 0; i < slot_num_; ++i) { + feed_vec_[3 + 2 * i]->set_lod(lod); + } + } + + ins_buf_pair_len_ -= total_instance / 2; + + cudaStreamSynchronize(stream_); + + if (debug_mode_) { + int64_t h_slot_tensor[slot_num_][total_instance]; + int64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], slot_tensor_ptr_[i], total_instance * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000? 5000: total_instance; + for(int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i <<"][" << j << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], slot_lod_tensor_ptr_[i], (total_instance + 1) * sizeof(int64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000? 
5000: total_instance + 1; + for(int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i <<"][" << j << "] = " << h_slot_lod_tensor[i][j]; + } + } + } + + return 1; +} + +__global__ void GraphFillSampleKeysKernel(uint64_t *neighbors, + uint64_t *sample_keys, + int *prefix_sum, int *sampleidx2row, + int *tmp_sampleidx2row, + int *actual_sample_size, + int cur_degree, int len) { + CUDA_KERNEL_LOOP(idx, len) { + for (int k = 0; k < actual_sample_size[idx]; k++) { + size_t offset = prefix_sum[idx] + k; + sample_keys[offset] = neighbors[idx * cur_degree + k]; + tmp_sampleidx2row[offset] = sampleidx2row[idx] + k; + } + } +} + +__global__ void GraphDoWalkKernel(uint64_t *neighbors, uint64_t *walk, + int *d_prefix_sum, int *actual_sample_size, + int cur_degree, int step, int len, + int *id_cnt, int *sampleidx2row, + int col_size) { + CUDA_KERNEL_LOOP(i, len) { + for (int k = 0; k < actual_sample_size[i]; k++) { + // int idx = sampleidx2row[i]; + size_t row = sampleidx2row[k + d_prefix_sum[i]]; + // size_t row = idx * cur_degree + k; + size_t col = step; + size_t offset = (row * col_size + col); + walk[offset] = neighbors[i * cur_degree + k]; + } + } +} + +// Fill keys to the first column of walk +__global__ void GraphFillFirstStepKernel(int *prefix_sum, int *sampleidx2row, + uint64_t *walk, uint64_t *keys, + int len, int walk_degree, int col_size, + int *actual_sample_size, + uint64_t *neighbors, + uint64_t *sample_keys) { + CUDA_KERNEL_LOOP(idx, len) { + for (int k = 0; k < actual_sample_size[idx]; k++) { + size_t row = prefix_sum[idx] + k; + sample_keys[row] = neighbors[idx * walk_degree + k]; + sampleidx2row[row] = row; + + size_t offset = col_size * row; + walk[offset] = keys[idx]; + walk[offset + 1] = neighbors[idx * walk_degree + k]; + } + } +} + +// Fill sample_res to the stepth column of walk +void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, + int len, NeighborSampleResult &sample_res, + int cur_degree, int step, + int *len_per_row) { + size_t temp_storage_bytes = 0; + int *d_actual_sample_size = sample_res.actual_sample_size; + uint64_t *d_neighbors = sample_res.val; + int *d_prefix_sum = reinterpret_cast(d_prefix_sum_->ptr()); + uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); + int *d_sampleidx2row = + reinterpret_cast(d_sampleidx2rows_[cur_sampleidx2row_]->ptr()); + int *d_tmp_sampleidx2row = + reinterpret_cast(d_sampleidx2rows_[1 - cur_sampleidx2row_]->ptr()); + + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, + d_actual_sample_size, + d_prefix_sum + 1, len, stream_)); + auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + + CUDA_CHECK(cub::DeviceScan::InclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, + d_prefix_sum + 1, len, stream_)); + + cudaStreamSynchronize(stream_); + + if (step == 1) { + GraphFillFirstStepKernel<<>>( + d_prefix_sum, d_tmp_sampleidx2row, walk, d_start_ids, len, walk_degree_, + walk_len_, d_actual_sample_size, d_neighbors, d_sample_keys); + + } else { + GraphFillSampleKeysKernel<<>>( + d_neighbors, d_sample_keys, d_prefix_sum, d_sampleidx2row, + d_tmp_sampleidx2row, d_actual_sample_size, cur_degree, len); + + GraphDoWalkKernel<<>>( + d_neighbors, walk, d_prefix_sum, d_actual_sample_size, cur_degree, step, + len, len_per_row, d_tmp_sampleidx2row, walk_len_); + } + if (debug_mode_) { + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + int *h_prefix_sum = new int[len + 1]; + int *h_actual_size = new 
int[len]; + int *h_offset2idx = new int[once_max_sample_keynum]; + int64_t *h_sample_keys = new int64_t[once_max_sample_keynum]; + cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, + once_max_sample_keynum * sizeof(int), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_prefix_sum, d_prefix_sum, (len + 1) * sizeof(int), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < once_max_sample_keynum; xx++) { + VLOG(2) << "h_offset2idx[" << xx << "]: " << h_offset2idx[xx]; + } + for (int xx = 0; xx < len + 1; xx++) { + VLOG(2) << "h_prefix_sum[" << xx << "]: " << h_prefix_sum[xx]; + } + delete[] h_prefix_sum; + delete[] h_actual_size; + delete[] h_offset2idx; + delete[] h_sample_keys; + } + cudaStreamSynchronize(stream_); + cur_sampleidx2row_ = 1 - cur_sampleidx2row_; +} + +int GraphDataGenerator::FillFeatureBuf(std::shared_ptr d_walk, + std::shared_ptr d_feature) { + platform::CUDADeviceGuard guard(gpuid_); + + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + int ret = gpu_graph_ptr->get_feature_of_nodes(gpuid_, d_walk, d_feature, buf_size_, slot_num_); + return ret; +} + +int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { + platform::CUDADeviceGuard guard(gpuid_); + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + //////// + uint64_t *h_walk; + uint64_t *h_sample_keys; + int *h_offset2idx; + int *h_len_per_row; + uint64_t *h_prefix_sum; + if (debug_mode_) { + h_walk = new uint64_t[buf_size_]; + h_sample_keys = new uint64_t[once_max_sample_keynum]; + h_offset2idx = new int[once_max_sample_keynum]; + h_len_per_row = new int[once_max_sample_keynum]; + h_prefix_sum = new uint64_t[once_max_sample_keynum + 1]; + } + /////// + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + uint64_t *walk = reinterpret_cast(d_walk->ptr()); + int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); + uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); + cudaMemsetAsync(len_per_row, 0, once_max_sample_keynum * sizeof(int), + stream_); + int i = 0; + int total_row = 0; + size_t node_type_len = first_node_type_.size(); + int remain_size = + buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + + while (i <= remain_size) { + int cur_node_idx = cursor_ % node_type_len; + int node_type = first_node_type_[cur_node_idx]; + auto &path = meta_path_[cur_node_idx]; + size_t start = node_type_start_[node_type]; + // auto node_query_result = gpu_graph_ptr->query_node_list( + // gpuid_, node_type, start, once_sample_startid_len_); + + // int tmp_len = node_query_result.actual_sample_size; + VLOG(2) << "choose start type: " << node_type; + int type_index = type_to_index_[node_type]; + size_t device_key_size = h_device_keys_[type_index]->size(); + VLOG(2) << "type: " << node_type << " size: " << device_key_size + << " start: " << start; + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[type_index]->ptr()); + int tmp_len = start + once_sample_startid_len_ > device_key_size + ? 
device_key_size - start + : once_sample_startid_len_; + node_type_start_[node_type] = tmp_len + start; + if (tmp_len == 0) { + finish_node_type_.insert(node_type); + if (finish_node_type_.size() == node_type_start_.size()) { + break; + } + cursor_ += 1; + continue; + } + // if (tmp_len == 0) { + // break; + //} + VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ + << " tmp_len = " << tmp_len << " cursor = " << cursor_ + << " once_max_sample_keynum = " << once_max_sample_keynum; + uint64_t *cur_walk = walk + i; + + NeighborSampleQuery q; + q.initialize(gpuid_, path[0], (uint64_t)(d_type_keys + start), walk_degree_, + tmp_len); + auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); + + int step = 1; + VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; + jump_rows_ = sample_res.total_sample_size; + FillOneStep(d_type_keys + start, cur_walk, tmp_len, sample_res, + walk_degree_, step, len_per_row); + VLOG(2) << "jump_row: " << jump_rows_; + ///////// + if (debug_mode_) { + cudaMemcpy(h_walk, walk, buf_size_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < buf_size_; xx++) { + VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; + } + } + ///////// + step++; + size_t path_len = path.size(); + for (; step < walk_len_; step++) { + if (sample_res.total_sample_size == 0) { + break; + } + auto sample_key_mem = sample_res.actual_val_mem; + uint64_t *sample_keys_ptr = + reinterpret_cast(sample_key_mem->ptr()); + int edge_type_id = path[(step - 1) % path_len]; + VLOG(2) << "sample edge type: " << edge_type_id << " step: " << step; + q.initialize(gpuid_, edge_type_id, (uint64_t)sample_keys_ptr, 1, + sample_res.total_sample_size); + sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); + + FillOneStep(d_type_keys + start, cur_walk, sample_res.total_sample_size, + sample_res, 1, step, len_per_row); + if (debug_mode_) { + cudaMemcpy(h_walk, walk, buf_size_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < buf_size_; xx++) { + VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; + } + } + } + // cursor_ += tmp_len; + i += jump_rows_ * walk_len_; + total_row += jump_rows_; + cursor_ += 1; + } + buf_state_.Reset(total_row); + int *d_random_row = reinterpret_cast(d_random_row_->ptr()); + + thrust::random::default_random_engine engine(shuffle_seed_); + const auto &exec_policy = thrust::cuda::par.on(stream_); + thrust::counting_iterator cnt_iter(0); + thrust::shuffle_copy(exec_policy, cnt_iter, cnt_iter + total_row, + thrust::device_pointer_cast(d_random_row), engine); + + cudaStreamSynchronize(stream_); + shuffle_seed_ = engine(); + + if (debug_mode_) { + int *h_random_row = new int[total_row + 10]; + cudaMemcpy(h_random_row, d_random_row, total_row * sizeof(int), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < total_row; xx++) { + VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; + } + delete[] h_random_row; + delete[] h_walk; + delete[] h_sample_keys; + delete[] h_offset2idx; + delete[] h_len_per_row; + delete[] h_prefix_sum; + } + return total_row != 0; +} + +void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, + std::vector feed_vec) { + place_ = place; + gpuid_ = place_.GetDeviceId(); + VLOG(3) << "gpuid " << gpuid_; + stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + feed_vec_ = feed_vec; + slot_num_ = (feed_vec_.size() - 3) / 2; + + // d_device_keys_.resize(h_device_keys_.size()); + VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); + 
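+  // Copy each host-side key list (one per node type) into its own device
+  // buffer asynchronously on stream_, preserving the per-type order in
+  // d_device_keys_.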
+ for (size_t i = 0; i < h_device_keys_.size(); i++) { + for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + VLOG(3) << "h_device_keys_[" << i << "][" << j + << "] = " << (*(h_device_keys_[i]))[j]; + } + auto buf = memory::AllocShared( + place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + d_device_keys_.push_back(buf); + CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), h_device_keys_[i]->data(), + h_device_keys_[i]->size() * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream_)); + } + // h_device_keys_ = h_device_keys; + // device_key_size_ = h_device_keys_->size(); + // d_device_keys_ = + // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); + // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), + // device_key_size_ * sizeof(int64_t), + // cudaMemcpyHostToDevice, stream_)); + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + d_prefix_sum_ = + memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); + cudaMemsetAsync(d_prefix_sum_ptr, 0, + (once_max_sample_keynum + 1) * sizeof(int), stream_); + cursor_ = 0; + jump_rows_ = 0; + d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); + cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); + d_feature_ = memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); + cudaMemsetAsync(d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); + d_sample_keys_ = + memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + + d_sampleidx2rows_.push_back( + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_sampleidx2rows_.push_back( + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + cur_sampleidx2row_ = 0; + + d_len_per_row_ = + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + for (int i = -window_; i < 0; i++) { + window_step_.push_back(i); + } + for (int i = 0; i < window_; i++) { + window_step_.push_back(i + 1); + } + buf_state_.Init(batch_size_, walk_len_, &window_step_); + d_random_row_ = memory::AllocShared( + place_, + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + shuffle_seed_ = 0; + + ins_buf_pair_len_ = 0; + d_ins_buf_ = + memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(int64_t)); + d_feature_buf_ = + memory::AllocShared(place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(int64_t)); + d_pair_num_ = memory::AllocShared(place_, sizeof(int)); + + cudaStreamSynchronize(stream_); +} + +void GraphDataGenerator::SetConfig( + const paddle::framework::DataFeedDesc &data_feed_desc) { + auto graph_config = data_feed_desc.graph_config(); + walk_degree_ = graph_config.walk_degree(); + walk_len_ = graph_config.walk_len(); + window_ = graph_config.window(); + once_sample_startid_len_ = graph_config.once_sample_startid_len(); + debug_mode_ = graph_config.debug_mode(); + if (debug_mode_) { + batch_size_ = graph_config.batch_size(); + } else { + batch_size_ = once_sample_startid_len_; + } + repeat_time_ = graph_config.sample_times_one_chunk(); + buf_size_ = + once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; + VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + << ", walk_len : " << walk_len_ << ", window : " << window_ + << ", once_sample_startid_len : " << once_sample_startid_len_ + << ", sample_times_one_chunk : " << repeat_time_ + << ", batch_size: " << batch_size_; + std::string first_node_type = 
graph_config.first_node_type(); + std::string meta_path = graph_config.meta_path(); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto edge_to_id = gpu_graph_ptr->edge_to_id; + auto node_to_id = gpu_graph_ptr->feature_to_id; + // parse first_node_type + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + finish_node_type_.clear(); + node_type_start_.clear(); + for (auto &type : node_types) { + auto iter = node_to_id.find(type); + PADDLE_ENFORCE_NE( + iter, node_to_id.end(), + platform::errors::NotFound("(%s) is not found in node_to_id.", type)); + VLOG(2) << "node_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + node_type_start_[iter->second] = 0; + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE( + iter, edge_to_id.end(), + platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } +}; + } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 6f7f1dac52804f..9c44de182e1587 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -23,6 +23,7 @@ limitations under the License. */ #include // NOLINT #include #include // NOLINT +#include #include #include #include // NOLINT @@ -56,6 +57,8 @@ namespace framework { class DataFeedDesc; class Scope; class Variable; +class NeighborSampleResult; +class NodeQueryResult; } // namespace framework } // namespace paddle @@ -774,6 +777,190 @@ class DLManager { std::map handle_map_; }; +struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9; + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)}; + engine.seed(sseq); + } +}; + +struct BufState { + int left; + int right; + int central_word; + int step; + engine_wrapper_t random_engine_; + + int len; + int cursor; + int row_num; + + int batch_size; + int walk_len; + std::vector* window; + + BufState() {} + ~BufState() {} + + void Init(int graph_batch_size, int graph_walk_len, + std::vector* graph_window) { + batch_size = graph_batch_size; + walk_len = graph_walk_len; + window = graph_window; + + left = 0; + right = window->size() - 1; + central_word = -1; + step = -1; + + len = 0; + cursor = 0; + row_num = 0; + for (size_t i = 0; i < graph_window->size(); i++) { + VLOG(2) << "graph_window[" << i << "] = " << (*graph_window)[i]; + } + } + + void Reset(int total_rows) { + cursor = 0; + row_num = total_rows; + int tmp_len = cursor + batch_size > row_num ? 
row_num - cursor : batch_size; + len = tmp_len; + central_word = -1; + step = -1; + GetNextCentrolWord(); + } + + int GetNextStep() { + step++; + if (step <= right && central_word + (*window)[step] < walk_len) { + return 1; + } + return 0; + } + + void Debug() { + VLOG(2) << "left: " << left << " right: " << right + << " central_word: " << central_word << " step: " << step + << " cursor: " << cursor << " len: " << len + << " row_num: " << row_num; + } + + int GetNextCentrolWord() { + if (++central_word >= walk_len) { + return 0; + } + int window_size = window->size() / 2; + int random_window = random_engine_.engine() % window_size + 1; + left = window_size - random_window; + right = window_size + random_window - 1; + VLOG(2) << "random window: " << random_window << " window[" << left + << "] = " << (*window)[left] << " window[" << right + << "] = " << (*window)[right]; + + for (step = left; step <= right; step++) { + if (central_word + (*window)[step] >= 0) { + return 1; + } + } + return 0; + } + + int GetNextBatch() { + cursor += len; + int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; + if (tmp_len == 0) { + return 0; + } + len = tmp_len; + central_word = -1; + step = -1; + GetNextCentrolWord(); + return tmp_len != 0; + } +}; + +class GraphDataGenerator { + public: + GraphDataGenerator(){}; + virtual ~GraphDataGenerator(){}; + void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); + void AllocResource(const paddle::platform::Place& place, + std::vector feed_vec); + int AcquireInstance(BufState* state); + int GenerateBatch(); + int FillWalkBuf(std::shared_ptr d_walk); + int FillFeatureBuf(std::shared_ptr d_walk, + std::shared_ptr d_feature); + void FillOneStep(uint64_t* start_ids, uint64_t* walk, int len, + NeighborSampleResult& sample_res, int cur_degree, int step, + int* len_per_row); + int FillInsBuf(); + void SetDeviceKeys(std::vector* device_keys, int type) { + type_to_index_[type] = h_device_keys_.size(); + h_device_keys_.push_back(device_keys); + } + + protected: + int walk_degree_; + int walk_len_; + int window_; + int once_sample_startid_len_; + int gpuid_; + // start ids + // int64_t* device_keys_; + // size_t device_key_size_; + std::vector*> h_device_keys_; + std::unordered_map type_to_index_; + // point to device_keys_ + size_t cursor_; + size_t jump_rows_; + int64_t* id_tensor_ptr_; + int64_t* show_tensor_ptr_; + int64_t* clk_tensor_ptr_; + cudaStream_t stream_; + paddle::platform::Place place_; + std::vector feed_vec_; + std::vector offset_; + std::shared_ptr d_prefix_sum_; + std::vector> d_device_keys_; + + std::shared_ptr d_walk_; + std::shared_ptr d_feature_; + std::shared_ptr d_len_per_row_; + std::shared_ptr d_random_row_; + // + std::vector> d_sampleidx2rows_; + int cur_sampleidx2row_; + // record the keys to call graph_neighbor_sample + std::shared_ptr d_sample_keys_; + int sample_keys_len_; + + std::set finish_node_type_; + std::unordered_map node_type_start_; + + std::shared_ptr d_ins_buf_; + std::shared_ptr d_feature_buf_; + std::shared_ptr d_pair_num_; + int ins_buf_pair_len_; + // size of a d_walk buf + size_t buf_size_; + int repeat_time_; + std::vector window_step_; + BufState buf_state_; + int batch_size_; + int slot_num_; + int shuffle_seed_; + int debug_mode_; + std::vector first_node_type_; + std::vector> meta_path_; +}; + class DataFeed { public: DataFeed() { @@ -836,6 +1023,12 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void 
SetCurrentPhase(int current_phase) {} + virtual void SetDeviceKeys(std::vector* device_keys, int type) { + gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { + gpu_graph_mode_ = gpu_graph_mode; + } virtual void SetFileListMutex(std::mutex* mutex) { mutex_for_pick_file_ = mutex; } @@ -919,6 +1112,8 @@ class DataFeed { // The input type of pipe reader, 0 for one sample, 1 for one batch int input_type_; + int gpu_graph_mode_ = 0; + GraphDataGenerator gpu_graph_data_generator_; }; // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 6964446f20946f..fe606630f92188 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -27,6 +27,18 @@ message MultiSlotDesc { optional string uid_slot = 2; } +message GraphConfig { + optional int32 walk_degree = 1 [ default = 1 ]; + optional int32 walk_len = 2 [ default = 20 ]; + optional int32 window = 3 [ default = 5 ]; + optional int32 once_sample_startid_len = 4 [ default = 8000 ]; + optional int32 sample_times_one_chunk = 5 [ default = 10 ]; + optional int32 batch_size = 6 [ default = 1 ]; + optional int32 debug_mode = 7 [ default = 0 ]; + optional string first_node_type = 8; + optional string meta_path = 9; +} + message DataFeedDesc { optional string name = 1; optional int32 batch_size = 2 [ default = 32 ]; @@ -37,4 +49,5 @@ message DataFeedDesc { optional int32 pv_batch_size = 7 [ default = 32 ]; optional int32 input_type = 8 [ default = 0 ]; optional string so_parser_name = 9; + optional GraphConfig graph_config = 10; } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index b4ae9949f2c6e6..beb0cc316da4b0 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -25,6 +25,7 @@ #ifdef PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #endif #if defined _WIN32 || defined __APPLE__ @@ -120,6 +121,24 @@ void DatasetImpl::SetDataFeedDesc(const std::string& data_feed_desc_str) { &data_feed_desc_); } +template +std::vector DatasetImpl::GetSlots() { + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + use_slots_.clear(); + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.type() == "uint64" || slot.type() == "uint32") { + use_slots_.push_back(slot.name()); + } + } + std::cout << "dataset use slots: "; + for (auto s : use_slots_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + return use_slots_; +} + template void DatasetImpl::SetChannelNum(int channel_num) { channel_num_ = channel_num; @@ -302,12 +321,11 @@ static int compute_thread_batch_nccl( thread_avg_batch_num = static_cast(offset.size() / thr_num); #ifdef PADDLE_WITH_GLOO auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); - if (!gloo_wrapper->IsInitialized()) { - VLOG(0) << "GLOO is not inited"; - gloo_wrapper->Init(); - } - if (gloo_wrapper->Size() > 1) { + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } // adjust batch num per thread for NCCL std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); std::vector total_instance_num_vec(1, total_instance_num); @@ -409,6 +427,18 @@ void MultiSlotDataset::PrepareTrain() { return; } +template +void 
DatasetImpl::SetGraphDeviceKeys( + const std::vector& h_device_keys) { + // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { + // gpu_graph_device_keys_[i].clear(); + // } + // size_t device_num = gpu_graph_device_keys_.size(); + // for (size_t i = 0; i < h_device_keys.size(); i++) { + // int shard = h_device_keys[i] % device_num; + // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); + // } +} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -417,12 +447,70 @@ void DatasetImpl::LoadIntoMemory() { platform::Timer timeline; timeline.Start(); std::vector load_threads; - for (int64_t i = 0; i < thread_num_; ++i) { - load_threads.push_back(std::thread( - &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); - } - for (std::thread& t : load_threads) { - t.join(); + if (gpu_graph_mode_) { + VLOG(0) << "in gpu_graph_mode"; + graph_all_type_total_keys_.clear(); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + graph_all_type_total_keys_.resize(node_to_id.size()); + int cnt = 0; + for (auto& iter : node_to_id) { + int node_idx = iter.second; + auto gpu_graph_device_keys = + gpu_graph_ptr->get_all_id(1, node_idx, thread_num_); + auto& type_total_key = graph_all_type_total_keys_[cnt]; + type_total_key.resize(thread_num_); + for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i + << "] = " << gpu_graph_device_keys[i].size(); + for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + type_total_key[i].push_back(gpu_graph_device_keys[i][j]); + } + } + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + cnt++; + } + //TODO(huwei02): open it when slot fea ready + //for (auto& iter : node_to_id) { + // int node_idx = iter.second; + // auto gpu_graph_device_keys = + // gpu_graph_ptr->get_all_feature_ids(1, node_idx, thread_num_); + // for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + // VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i + // << "] = " << gpu_graph_device_keys[i].size(); + // for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + // gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + // } + // } + //} + + // FIX: trick for iterate edge table + for (auto& iter : edge_to_id) { + int edge_idx = iter.second; + auto gpu_graph_device_keys = + gpu_graph_ptr->get_all_id(0, edge_idx, thread_num_); + for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + VLOG(1) << "edge type: " << edge_idx << ", gpu_graph_device_keys[" << i + << "] = " << gpu_graph_device_keys[i].size(); + for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + } + } + } + + } else { + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } } input_channel_->Close(); int64_t in_chan_size = input_channel_->Size(); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 3f10cd7765bc1f..0d326d3fd1364a 100644 --- a/paddle/fluid/framework/data_set.h +++ 
b/paddle/fluid/framework/data_set.h @@ -152,12 +152,16 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // separate train thread and dataset thread + // seperate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num, bool discard_remaining_ins = false) = 0; virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; + virtual void SetGraphDeviceKeys( + const std::vector& h_device_keys) = 0; + + virtual std::vector GetSlots() = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -238,6 +242,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys); virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -246,6 +251,7 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); + virtual std::vector GetSlots(); /* for enable_heterps_ virtual void EnableHeterps(bool enable_heterps) { enable_heterps_ = enable_heterps; @@ -263,7 +269,9 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } - + std::vector& GetGpuGraphTotalKeys() { + return gpu_graph_total_keys_; + } Channel& GetInputChannelRef() { return input_channel_; } protected: @@ -321,7 +329,12 @@ class DatasetImpl : public Dataset { int64_t global_index_ = 0; std::vector> consume_task_pool_; std::vector input_records_; // only for paddleboxdatafeed + std::vector use_slots_; bool enable_heterps_ = false; + int gpu_graph_mode_ = 1; + // std::vector> gpu_graph_device_keys_; + std::vector>> graph_all_type_total_keys_; + std::vector gpu_graph_total_keys_; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 4fddfca5d805ac..37ec4666a30d67 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -69,7 +69,7 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, int node_num, int index) { #ifdef PADDLE_WITH_PSLIB if (!is_initialized_) { - VLOG(3) << "Going to init worker"; + VLOG(0) << "Going to init worker"; pslib_ptr_ = std::shared_ptr( new paddle::distributed::PSlib()); pslib_ptr_->init_worker(dist_desc, @@ -126,7 +126,7 @@ void FleetWrapper::GatherServers(const std::vector& host_sign_list, void FleetWrapper::GatherClients(const std::vector& host_sign_list) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to gather client ips"; + VLOG(0) << "Going to gather client ips"; size_t len = host_sign_list.size(); pslib_ptr_->gather_clients(const_cast(host_sign_list.data()), len); #endif @@ -142,7 +142,7 @@ std::vector FleetWrapper::GetClientsInfo() { void FleetWrapper::CreateClient2ClientConnection() { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to create client2client connection"; + VLOG(0) << "Going to create client2client connection"; pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_, client2client_connect_timeout_ms_, client2client_max_retry_); @@ -1054,7 +1054,8 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync( int slot_offset = 0; int grad_dim = 0; // don't worry, user do not have to care about all these 
flags - if (accesor == "DownpourCtrAccessor") { + if (accesor == "DownpourCtrAccessor" || + accesor == "DownpourCtrDymfAccessor") { dump_slot = true; slot_offset = 1; grad_dim = fea_dim - 2; diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3fdcf2379cb54a..823b60c5ef1f24 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -95,24 +95,6 @@ class HeterContext { } void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } uint32_t ShardNum() { return shard_num_; } - void init(int shard_num, int device_num) { - shard_num_ = shard_num; - feature_keys_.resize(shard_num_); - value_ptr_.resize(shard_num_); - device_task_ptr_.resize(shard_num_); - device_task_keys_.resize(shard_num_); - for (size_t i = 0; i < device_task_ptr_.size(); i++) { - device_task_ptr_[i].resize(device_num); - device_task_keys_[i].resize(device_num); - } - - device_values_.resize(device_num); - device_keys_.resize(device_num); - mutex_.resize(device_num); - for (size_t i = 0; i < mutex_.size(); ++i) { - mutex_[i] = new std::mutex(); - } - } void init(int shard_num, int device_num, int dim_num) { shard_num_ = shard_num; @@ -129,11 +111,6 @@ class HeterContext { for (size_t i = 0; i < feature_dim_keys_.size(); i++) { feature_dim_keys_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num); - if (i == 0) { - for (int j = 0; j < dim_num; j++) { - feature_dim_keys_[i][j].push_back(0); - } - } } device_values_.resize(device_num); device_dim_values_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index b633394e7a8117..cb7f3a40d6720b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -32,17 +32,33 @@ struct FeatureValue { float lr; float lr_g2sum; int mf_size; - float mf[MF_DIM + 1]; + int mf_dim; uint64_t cpu_ptr; + float mf[0]; friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) { out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot - << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:"; - for (int i = 0; i < val.mf_size; ++i) { + << " lr: " << val.lr << " mf_dim: " << val.mf_dim + << "cpuptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_dim + 1; ++i) { out << " " << val.mf[i]; } return out; } + __device__ __forceinline__ void operator=(const FeatureValue& in) { + delta_score = in.delta_score; + show = in.show; + clk = in.clk; + slot = in.slot; + lr = in.lr; + lr_g2sum = in.lr_g2sum; + mf_size = in.mf_size; + mf_dim = in.mf_dim; + cpu_ptr = in.cpu_ptr; + for (int i = 0; i < mf_dim + 1; i++) { + mf[i] = in.mf[i]; + } + } }; struct FeaturePushValue { @@ -50,20 +66,33 @@ struct FeaturePushValue { float clk; int slot; float lr_g; - float mf_g[MF_DIM]; + int mf_dim; + float mf_g[0]; - // __device__ __forceinline__ FeaturePushValue - // operator+(const FeaturePushValue& a) const { - // FeaturePushValue out; - // out.slot = a.slot; - // out.show = a.show + show; - // out.clk = a.clk + clk; - // out.lr_g = a.lr_g + lr_g; - // for (int i = 0; i < MF_DIM; ++i) { - // out.mf_g[i] = a.mf_g[i] + mf_g[i]; - // } - // return out; - // } + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = 
a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } + __device__ __forceinline__ void operator=(const FeaturePushValue& in) { + show = in.show; + clk = in.clk; + slot = in.slot; + lr_g = in.lr_g; + mf_dim = in.mf_dim; + for (int i = 0; i < mf_dim; i++) { + mf_g[i] = in.mf_g[i]; + } + } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 19c355c671a386..dcdca8944b1424 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -20,23 +20,24 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/phi/core/enforce.h" namespace paddle { namespace framework { struct GpuPsGraphNode { - int64_t node_id; + uint64_t node_id; int64_t neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; struct GpuPsCommGraph { - int64_t *neighbor_list; + uint64_t *neighbor_list; GpuPsGraphNode *node_list; int64_t neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + GpuPsCommGraph(uint64_t *neighbor_list_, GpuPsGraphNode *node_list_, int64_t neighbor_size_, int64_t node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), @@ -45,7 +46,7 @@ struct GpuPsCommGraph { void init_on_cpu(int64_t neighbor_size, int64_t node_size) { this->neighbor_size = neighbor_size; this->node_size = node_size; - this->neighbor_list = new int64_t[neighbor_size]; + this->neighbor_list = new uint64_t[neighbor_size]; this->node_list = new paddle::framework::GpuPsGraphNode[node_size]; } void release_on_cpu() { @@ -55,15 +56,15 @@ struct GpuPsCommGraph { void display_on_cpu() { VLOG(0) << "neighbor_size = " << neighbor_size; VLOG(0) << "node_size = " << node_size; - for (size_t i = 0; i < neighbor_size; i++) { + for (int64_t i = 0; i < neighbor_size; i++) { VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; } - for (size_t i = 0; i < node_size; i++) { + for (int64_t i = 0; i < node_size; i++) { VLOG(0) << "node i " << node_list[i].node_id << " neighbor_size = " << node_list[i].neighbor_size; std::string str; int offset = node_list[i].neighbor_offset; - for (size_t j = 0; j < node_list[i].neighbor_size; j++) { + for (int64_t j = 0; j < node_list[i].neighbor_size; j++) { if (j > 0) str += ","; str += std::to_string(neighbor_list[j + offset]); } @@ -123,21 +124,25 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 */ struct NeighborSampleQuery { int gpu_id; - int64_t *key; - int sample_size; + int table_idx; + uint64_t *src_nodes; int len; - void initialize(int gpu_id, int64_t key, int sample_size, int len) { + int sample_size; + void initialize(int gpu_id, int table_idx, uint64_t src_nodes, + int sample_size, int len) { + this->table_idx = table_idx; this->gpu_id = gpu_id; - this->key = (int64_t *)key; + this->src_nodes = (uint64_t *)src_nodes; this->sample_size = sample_size; this->len = len; } void display() { - int64_t *sample_keys = new int64_t[len]; + uint64_t *sample_keys = new uint64_t[len]; VLOG(0) << "device_id " << gpu_id << " sample_size = " << 
sample_size; - VLOG(0) << "there are " << len << " keys "; + VLOG(0) << "there are " << len << " keys to sample for graph " << table_idx; std::string key_str; - cudaMemcpy(sample_keys, key, len * sizeof(int64_t), cudaMemcpyDeviceToHost); + cudaMemcpy(sample_keys, src_nodes, len * sizeof(uint64_t), + cudaMemcpyDeviceToHost); for (int i = 0; i < len; i++) { if (key_str.size() > 0) key_str += ";"; @@ -148,14 +153,14 @@ struct NeighborSampleQuery { } }; struct NeighborSampleResult { - int64_t *val; - int64_t *actual_val; + uint64_t *val; + uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; - int64_t *get_val() { return val; } - int64_t get_actual_val() { return (int64_t)actual_val; } + uint64_t *get_val() { return val; } + uint64_t get_actual_val() { return (uint64_t)actual_val; } int *get_actual_sample_size() { return actual_sample_size; } int get_sample_size() { return sample_size; } int get_key_size() { return key_size; } @@ -167,16 +172,16 @@ struct NeighborSampleResult { platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(int64_t)); - val = (int64_t *)val_mem->ptr(); + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + val = (uint64_t *)val_mem->ptr(); actual_sample_size_mem = memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } void display() { VLOG(0) << "in node sample result display ------------------"; - int64_t *res = new int64_t[sample_size * key_size]; - cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), + uint64_t *res = new uint64_t[sample_size * key_size]; + cudaMemcpy(res, val, sample_size * key_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); int *ac_size = new int[key_size]; cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), @@ -185,8 +190,8 @@ struct NeighborSampleResult { for (int i = 0; i < key_size; i++) { total_sample_size += ac_size[i]; } - int64_t *res2 = new int64_t[total_sample_size]; // r - cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + uint64_t *res2 = new uint64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); // r int start = 0; @@ -208,13 +213,13 @@ struct NeighborSampleResult { delete[] ac_size; VLOG(0) << " ------------------"; } - std::vector get_sampled_graph(NeighborSampleQuery q) { - std::vector graph; + std::vector get_sampled_graph(NeighborSampleQuery q) { + std::vector graph; int64_t *sample_keys = new int64_t[q.len]; std::string key_str; - cudaMemcpy(sample_keys, q.key, q.len * sizeof(int64_t), + cudaMemcpy(sample_keys, q.src_nodes, q.len * sizeof(uint64_t), cudaMemcpyDeviceToHost); - int64_t *res = new int64_t[sample_size * key_size]; + uint64_t *res = new uint64_t[sample_size * key_size]; cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), cudaMemcpyDeviceToHost); int *ac_size = new int[key_size]; @@ -224,8 +229,8 @@ struct NeighborSampleResult { for (int i = 0; i < key_size; i++) { total_sample_size += ac_size[i]; } - int64_t *res2 = new int64_t[total_sample_size]; // r - cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + uint64_t *res2 = new uint64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); // r int start 
= 0; @@ -248,24 +253,24 @@ struct NeighborSampleResult { }; struct NodeQueryResult { - int64_t *val; + uint64_t *val; int actual_sample_size; - int64_t get_val() { return (int64_t)val; } + uint64_t get_val() { return (uint64_t)val; } int get_len() { return actual_sample_size; } std::shared_ptr val_mem; void initialize(int query_size, int dev_id) { platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = memory::AllocShared(place, query_size * sizeof(int64_t)); - val = (int64_t *)val_mem->ptr(); + val_mem = memory::AllocShared(place, query_size * sizeof(uint64_t)); + val = (uint64_t *)val_mem->ptr(); // cudaMalloc((void **)&val, query_size * sizeof(int64_t)); actual_sample_size = 0; } void display() { VLOG(0) << "in node query result display ------------------"; - int64_t *res = new int64_t[actual_sample_size]; - cudaMemcpy(res, val, actual_sample_size * sizeof(int64_t), + uint64_t *res = new uint64_t[actual_sample_size]; + cudaMemcpy(res, val, actual_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); VLOG(0) << "actual_sample_size =" << actual_sample_size; @@ -283,7 +288,71 @@ struct NodeQueryResult { actual_sample_size = 0; }; ~NodeQueryResult() {} +}; // end of struct NodeQueryResult + +struct GpuPsGraphFeaNode { + uint64_t node_id; + uint64_t feature_size, feature_offset; + // this node's feature is stored on [feature_offset,feature_offset + + // feature_size) of int64_t *feature_list; }; -} -}; + +struct GpuPsCommGraphFea { + uint64_t *feature_list; + uint8_t *slot_id_list; + GpuPsGraphFeaNode *node_list; + uint64_t feature_size, node_size; + // the size of feature array and graph_node_list array + GpuPsCommGraphFea() + : feature_list(NULL), + slot_id_list(NULL), + node_list(NULL), + feature_size(0), + node_size(0) {} + GpuPsCommGraphFea(uint64_t *feature_list_, uint8_t *slot_id_list_, + GpuPsGraphFeaNode *node_list_, uint64_t feature_size_, + uint64_t node_size_) + : feature_list(feature_list_), + slot_id_list(slot_id_list_), + node_list(node_list_), + feature_size(feature_size_), + node_size(node_size_) {} + void init_on_cpu(uint64_t feature_size, uint64_t node_size, + uint32_t slot_num) { + PADDLE_ENFORCE_LE(slot_num, 255); + this->feature_size = feature_size; + this->node_size = node_size; + this->feature_list = new uint64_t[feature_size]; + this->slot_id_list = new uint8_t[feature_size]; + this->node_list = new GpuPsGraphFeaNode[node_size]; + } + void release_on_cpu() { + delete[] feature_list; + delete[] slot_id_list; + delete[] node_list; + } + void display_on_cpu() { + VLOG(1) << "feature_size = " << feature_size; + VLOG(1) << "node_size = " << node_size; + for (uint64_t i = 0; i < feature_size; i++) { + VLOG(1) << "feature_list[" << i << "] = " << feature_list[i]; + } + for (uint64_t i = 0; i < node_size; i++) { + VLOG(1) << "node_id[" << node_list[i].node_id + << "] feature_size = " << node_list[i].feature_size; + std::string str; + int offset = node_list[i].feature_offset; + for (uint64_t j = 0; j < node_list[i].feature_size; j++) { + if (j > 0) str += ","; + str += std::to_string(slot_id_list[j + offset]); + str += ":"; + str += std::to_string(feature_list[j + offset]); + } + VLOG(1) << str; + } + } +}; // end of struct GpuPsCommGraphFea + +} // end of namespace framework +} // end of namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h new file mode 100644 index 00000000000000..e63043e414bbe9 --- /dev/null +++ 
b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace paddle { +namespace framework { + +#include +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +inline void debug_gpu_memory_info(const char* desc) { + int device_num = 0; + auto err = cudaGetDeviceCount(&device_num); + PADDLE_ENFORCE_EQ(err, cudaSuccess, + platform::errors::InvalidArgument("cudaGetDeviceCount failed!")); + + size_t avail{0}; + size_t total{0}; + for (int i = 0; i < device_num; ++i) { + cudaSetDevice(i); + auto err = cudaMemGetInfo(&avail, &total); + PADDLE_ENFORCE_EQ(err, cudaSuccess, + platform::errors::InvalidArgument("cudaMemGetInfo failed!")); + VLOG(0) << "update gpu memory on device " << i << ", " + << "avail=" << avail/1024.0/1024.0/1024.0 << "g, " + << "total=" << total/1024.0/1024.0/1024.0 << "g, " + << "use_rate=" << (total-avail)/double(total) << "%, " + << "desc=" << desc; + } +} + +}; // namespace framework +}; // namespace paddle + diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index ae57c2ebe932f8..c4231cb7beb8b0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,19 +23,38 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { +enum GraphTableType { EDGE_TABLE, FEATURE_TABLE }; class GpuPsGraphTable : public HeterComm { public: - GpuPsGraphTable(std::shared_ptr resource, int topo_aware) + int get_table_offset(int gpu_id, GraphTableType type, int idx) const { + int type_id = type; + return gpu_id * (graph_table_num_ + feature_table_num_) + + type_id * graph_table_num_ + idx; + } + GpuPsGraphTable(std::shared_ptr resource, int topo_aware, + int graph_table_num) : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); + this->graph_table_num_ = graph_table_num; + this->feature_table_num_ = 1; gpu_num = resource_->total_device(); memset(global_device_map, -1, sizeof(global_device_map)); + for (auto &table : tables_) { + delete table; + table = NULL; + } + int feature_table_num = 1; + tables_ = std::vector( + gpu_num * (graph_table_num + feature_table_num), NULL); for (int i = 0; i < gpu_num; i++) { - gpu_graph_list.push_back(GpuPsCommGraph()); global_device_map[resource_->dev_id(i)] = i; - sample_status.push_back(NULL); - tables_.push_back(NULL); + for (int j = 0; j < graph_table_num; j++) { + gpu_graph_list_.push_back(GpuPsCommGraph()); + } + for (int j = 0; j < feature_table_num; j++) { + gpu_graph_fea_list_.push_back(GpuPsCommGraphFea()); + } } cpu_table_status = -1; if (topo_aware) { @@ -89,35 +108,46 @@ class GpuPsGraphTable : public HeterComm { // end_graph_sampling(); // } } - void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id); - void clear_graph_info(int gpu_id); - void 
build_graph_from_cpu(std::vector &cpu_node_list); + void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id, int idx); + void build_graph_fea_on_single_gpu(GpuPsCommGraphFea &g, int gpu_id); + void clear_graph_info(int gpu_id, int index); + void clear_graph_info(int index); + void clear_feature_info(int gpu_id, int index); + void clear_feature_info(int index); + void build_graph_from_cpu(std::vector &cpu_node_list, + int idx); + void build_graph_fea_from_cpu(std::vector &cpu_node_list, + int idx); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); - NeighborSampleResult graph_neighbor_sample(int gpu_id, int64_t *key, + NeighborSampleResult graph_neighbor_sample(int gpu_id, uint64_t *key, int sample_size, int len); - NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key, - int sample_size, int len, - bool cpu_query_switch); - void init_sample_status(); - void free_sample_status(); - NodeQueryResult query_node_list(int gpu_id, int start, int query_size); - void clear_graph_info(); + NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int idx, + uint64_t *key, int sample_size, + int len, bool cpu_query_switch); + + int get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, int size, int slot_num); + + NodeQueryResult query_node_list(int gpu_id, int idx, int start, + int query_size); void display_sample_res(void *key, void *val, int len, int sample_len); - void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + void move_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, int *h_left, int *h_right, - int64_t *src_sample_res, + uint64_t *src_sample_res, int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int gpu_num; - std::vector gpu_graph_list; + int graph_table_num_, feature_table_num_; + std::vector gpu_graph_list_; + std::vector gpu_graph_fea_list_; int global_device_map[32]; - std::vector sample_status; const int parallel_sample_size = 1; const int dim_y = 256; - std::shared_ptr cpu_graph_table; + std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; std::condition_variable cv_; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 72b9cae41c0fdf..f423a33abe3499 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -32,8 +32,8 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ -__global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, - int64_t* cpu_key, int* sum, int* index, +__global__ void get_cpu_id_index(uint64_t* key, int* actual_sample_size, + uint64_t* cpu_key, int* sum, int* index, int len) { CUDA_KERNEL_LOOP(i, len) { if (actual_sample_size[i] == -1) { @@ -46,12 +46,12 @@ __global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, } __global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) { - CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); } + CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(uint64_t); } } template __global__ void copy_buffer_ac_to_final_place( - int64_t* gpu_buffer, int* gpu_ac, int64_t* val, int* actual_sample_size, + uint64_t* gpu_buffer, int* gpu_ac, uint64_t* val, int* actual_sample_size, int* index, int* 
cumsum_gpu_ac, int number_on_cpu, int sample_size) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -68,12 +68,43 @@ __global__ void copy_buffer_ac_to_final_place( } } +__global__ void get_features_kernel(GpuPsCommGraphFea graph, int64_t* node_offset_array, + int* actual_size, uint64_t* feature, int slot_num, int n) { + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < n) { + int node_offset = node_offset_array[idx]; + int offset = idx * slot_num; + if (node_offset == -1) { + for (int k = 0; k < slot_num; ++ k) { + feature[offset + k] = 0; + } + actual_size[idx] = slot_num; + return; + } + + GpuPsGraphFeaNode* node = &(graph.node_list[node_offset]); + uint64_t* feature_start = &(graph.feature_list[node->feature_offset]); + uint8_t* slot_id_start = &(graph.slot_id_list[node->feature_offset]); + int m = 0; + for (int k = 0; k < slot_num; ++k) { + if (m >= node->feature_size || k < slot_id_start[m]) { + feature[offset + k] = 0; + } else if (k == slot_id_start[m]) { + feature[offset + k] = feature_start[m]; + ++m; + } else { + assert(0); + } + } + actual_size[idx] = slot_num; + } +} + template -__global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - int64_t* node_index, - int* actual_size, int64_t* res, - int sample_len, int n, - int default_value) { +__global__ void neighbor_sample_kernel(GpuPsCommGraph graph, + int64_t* node_index, int* actual_size, + uint64_t* res, int sample_len, int n, + int default_value) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -91,7 +122,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size; int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset; int offset = i * sample_len; - int64_t* data = graph.neighbor_list; + uint64_t* data = graph.neighbor_list; if (neighbor_len <= sample_len) { for (int j = threadIdx.x; j < neighbor_len; j += WARP_SIZE) { res[offset + j] = data[data_offset + j]; @@ -120,85 +151,10 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } } -__global__ void neighbor_sample_example(GpuPsCommGraph graph, - int64_t* node_index, int* actual_size, - int64_t* res, int sample_len, - int* sample_status, int n, int from) { - int id = blockIdx.x * blockDim.y + threadIdx.y; - if (id < n) { - if (node_index[id] == -1) { - actual_size[id] = 0; - return; - } - curandState rng; - curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); - int64_t index = threadIdx.x; - int64_t offset = id * sample_len; - int64_t* data = graph.neighbor_list; - int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset; - int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size; - int ac_len; - if (sample_len > neighbor_len) - ac_len = neighbor_len; - else { - ac_len = sample_len; - } - if (4 * ac_len >= 3 * neighbor_len) { - if (index == 0) { - res[offset] = curand(&rng) % (neighbor_len - ac_len + 1); - } - __syncwarp(); - int start = res[offset]; - while (index < ac_len) { - res[offset + index] = data[data_offset + start + index]; - index += blockDim.x; - } - actual_size[id] = ac_len; - } else { - while (index < ac_len) { - int num = curand(&rng) % neighbor_len; - int* addr = sample_status + data_offset + num; - int expected = *addr; - if (!(expected & (1 << from))) { - int old = atomicCAS(addr, expected, expected | (1 << from)); - if (old == expected) { - res[offset + index] = num; - index += blockDim.x; - } - } - } - __syncwarp(); - index = threadIdx.x; - while (index 
< ac_len) { - int* addr = sample_status + data_offset + res[offset + index]; - int expected, old = *addr; - do { - expected = old; - old = atomicCAS(addr, expected, expected & (~(1 << from))); - } while (old != expected); - res[offset + index] = data[data_offset + res[offset + index]]; - index += blockDim.x; - } - actual_size[id] = ac_len; - } - } - // const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - // if (i < n) { - // auto node_index = index[i]; - // actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size - // ? graph.node_list[node_index].neighbor_size - // : sample_size; - // int offset = graph.node_list[node_index].neighbor_offset; - // for (int j = 0; j < actual_size[i]; j++) { - // sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; - // } - // } -} - int GpuPsGraphTable::init_cpu_table( const paddle::distributed::GraphParameter& graph) { - cpu_graph_table.reset(new paddle::distributed::GraphTable); - cpu_table_status = cpu_graph_table->Initialize(graph); + cpu_graph_table_.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table_->Initialize(graph); // if (cpu_table_status != 0) return cpu_table_status; // std::function&)> callback = // [this](std::vector& res) { @@ -212,17 +168,6 @@ int GpuPsGraphTable::init_cpu_table( return cpu_table_status; } -// int GpuPsGraphTable::load(const std::string& path, const std::string& param) -// { -// int status = cpu_graph_table->load(path, param); -// if (status != 0) { -// return status; -// } -// std::unique_lock lock(mutex_); -// cpu_graph_table->start_graph_sampling(); -// cv_.wait(lock); -// return 0; -// } /* comment 1 gpu i triggers a neighbor_sample task, @@ -246,30 +191,32 @@ int GpuPsGraphTable::init_cpu_table( void GpuPsGraphTable::display_sample_res(void* key, void* val, int len, int sample_len) { - char key_buffer[len * sizeof(int64_t)]; + char key_buffer[len * sizeof(uint64_t)]; char val_buffer[sample_len * sizeof(int64_t) * len + - (len + len % 2) * sizeof(int) + len * sizeof(int64_t)]; - cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost); + (len + len % 2) * sizeof(int) + len * sizeof(uint64_t)]; + cudaMemcpy(key_buffer, key, sizeof(uint64_t) * len, cudaMemcpyDeviceToHost); cudaMemcpy(val_buffer, val, sample_len * sizeof(int64_t) * len + - (len + len % 2) * sizeof(int) + len * sizeof(int64_t), + (len + len % 2) * sizeof(int) + len * sizeof(uint64_t), cudaMemcpyDeviceToHost); - int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) + - len * sizeof(int64_t)); + uint64_t* sample_val = + (uint64_t*)(val_buffer + (len + len % 2) * sizeof(int) + + len * sizeof(int64_t)); for (int i = 0; i < len; i++) { - printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t))); - printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t))); + printf("key %llu\n", *(int64_t*)(key_buffer + i * sizeof(uint64_t))); + printf("index %llu\n", *(int64_t*)(val_buffer + i * sizeof(uint64_t))); int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t)); printf("sampled %d neigbhors\n", ac_size); for (int j = 0; j < ac_size; j++) { - printf("%lld ", sample_val[i * sample_len + j]); + printf("%llu ", sample_val[i * sample_len + j]); } printf("\n"); } } -void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + +void GpuPsGraphTable::move_result_to_source_gpu( int start_index, int gpu_num, int sample_size, int* h_left, int* h_right, - int64_t* src_sample_res, int* actual_sample_size) { + uint64_t* 
src_sample_res, int* actual_sample_size) { int shard_len[gpu_num]; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -289,7 +236,7 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( reinterpret_cast(src_sample_res + h_left[i] * sample_size), node.val_storage + sizeof(int64_t) * shard_len[i] + sizeof(int) * (shard_len[i] + shard_len[i] % 2), - sizeof(int64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, + sizeof(uint64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, node.out_stream); cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), node.val_storage + sizeof(int64_t) * shard_len[i], @@ -304,115 +251,13 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( cudaStreamSynchronize(node.out_stream); // cudaStreamSynchronize(resource_->remote_stream(i, start_index)); } - /* - std::queue que; - // auto& node = path_[gpu_id][i].nodes_.front(); - // cudaMemcpyAsync( - // reinterpret_cast(src_sample_res + h_left[i] * sample_size), - // node.val_storage + sizeof(int64_t) * shard_len, - // node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, - // node.out_stream); - // cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - // node.val_storage + sizeof(int) * shard_len, - // sizeof(int) * shard_len, cudaMemcpyDefault, - // node.out_stream); - int cur_step = path_[start_index][i].nodes_.size() - 1; - auto& node = path_[start_index][i].nodes_[cur_step]; - if (cur_step == 0) { - // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), - // node.val_storage, node.val_bytes_len, - // cudaMemcpyDefault, - // node.out_stream); - // VLOG(0)<<"copy "<(src_sample_res + h_left[i] * sample_size), - node.val_storage + sizeof(int64_t) * shard_len[i], - node.val_bytes_len - sizeof(int64_t) * shard_len[i], - cudaMemcpyDefault, - node.out_stream); - //resource_->remote_stream(i, start_index)); - cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len[i], - sizeof(int) * shard_len[i], cudaMemcpyDefault, - node.out_stream); - //resource_->remote_stream(i, start_index)); - } else { - CopyTask t(&path_[start_index][i], cur_step - 1); - que.push(t); - // VLOG(0)<<"copy "<remote_stream(i, start_index)); - } - } - while (!que.empty()) { - CopyTask& cur_task = que.front(); - que.pop(); - int cur_step = cur_task.step; - if (cur_task.path->nodes_[cur_step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); - //cudaStreamSynchronize(resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } - if (cur_step > 0) { - CopyTask c(cur_task.path, cur_step - 1); - que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } else if (cur_step == 0) { - int end_index = cur_task.path->nodes_.back().gpu_num; - // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), - // cur_task.path->nodes_[cur_step].val_storage, - // cur_task.path->nodes_[cur_step].val_bytes_len, - // cudaMemcpyDefault, - // cur_task.path->nodes_[cur_step].out_stream); - //VLOG(0)<<"copy "<nodes_[cur_step].gpu_num<< " to - "<(src_sample_res + - h_left[end_index] * sample_size), - cur_task.path->nodes_[cur_step].val_storage + - sizeof(int64_t) * shard_len[end_index], - 
cur_task.path->nodes_[cur_step].val_bytes_len - - sizeof(int64_t) * shard_len[end_index], - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - cudaMemcpyAsync( - reinterpret_cast(actual_sample_size + h_left[end_index]), - cur_task.path->nodes_[cur_step].val_storage + - sizeof(int) * shard_len[end_index], - sizeof(int) * shard_len[end_index], cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } - } - for (int i = 0; i < gpu_num; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - auto& node = path_[start_index][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); - //cudaStreamSynchronize(resource_->remote_stream(i, start_index)); - } - */ } /* TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, +__global__ void fill_dvalues(uint64_t* d_shard_vals, uint64_t* d_vals, int* d_shard_actual_sample_size, int* d_actual_sample_size, int* idx, int sample_size, int len) { @@ -425,7 +270,18 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, } } -__global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, +__global__ void fill_dvalues(uint64_t* d_shard_vals, uint64_t* d_vals, + int* d_shard_actual_sample_size, + int* idx, int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } + } +} + +__global__ void fill_actual_vals(uint64_t* vals, uint64_t* actual_vals, int* actual_sample_size, int* cumsum_actual_sample_size, int sample_size, int len) { @@ -438,18 +294,48 @@ __global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, } __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, - int64_t* res) { + uint64_t* res) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) { res[i] = graph.node_list[start + i].node_id; } } -void GpuPsGraphTable::clear_graph_info(int gpu_id) { - if (tables_.size() && tables_[gpu_id] != NULL) { - delete tables_[gpu_id]; +void GpuPsGraphTable::clear_feature_info(int gpu_id) { + int idx = 0; + if (idx >= feature_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = NULL; + } + + int graph_fea_idx = gpu_id * feature_table_num_ + idx; + if (graph_fea_idx >= gpu_graph_fea_list_.size()) { + return; + } + auto& graph = gpu_graph_fea_list_[graph_fea_idx]; + if (graph.feature_list != NULL) { + cudaFree(graph.feature_list); + } + + if (graph.slot_id_list != NULL) { + cudaFree(graph.slot_id_list); + } + + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } +} + +void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) { + if (idx >= graph_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::EDGE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = NULL; } - auto& graph = gpu_graph_list[gpu_id]; + auto& graph = gpu_graph_list_[gpu_id * graph_table_num_ + idx]; if (graph.neighbor_list != NULL) { cudaFree(graph.neighbor_list); } @@ -457,21 +343,88 @@ void GpuPsGraphTable::clear_graph_info(int gpu_id) { cudaFree(graph.node_list); } } -void 
GpuPsGraphTable::clear_graph_info() { - if (tables_.size()) { - for (auto table : tables_) delete table; - } - tables_.clear(); - for (auto graph : gpu_graph_list) { - if (graph.neighbor_list != NULL) { - cudaFree(graph.neighbor_list); - } - if (graph.node_list != NULL) { - cudaFree(graph.node_list); +void GpuPsGraphTable::clear_graph_info(int idx) { + for (int i = 0; i < gpu_num; i++) clear_graph_info(i, idx); +} +/* +the parameter std::vector cpu_graph_list is generated by cpu. +it saves the graph to be saved on each gpu. +for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number +== i +In this function, memory is allocated on each gpu to save the graphs, +gpu i saves the ith graph from cpu_graph_list +*/ +void GpuPsGraphTable::build_graph_fea_on_single_gpu(GpuPsCommGraphFea& g, + int gpu_id) { + clear_feature_info(gpu_id); + int ntype_id = 0; + + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + + int offset = gpu_id * feature_table_num_ + ntype_id; + gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); + + int table_offset = + get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); + + size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + tables_[table_offset] = new Table(capacity); + if (g.node_size > 0) { + std::vector keys; + std::vector offsets; + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].node_list, + g.node_size * sizeof(GpuPsGraphFeaNode)); + cudaMemcpy(gpu_graph_fea_list_[offset].node_list, g.node_list, + g.node_size * sizeof(GpuPsGraphFeaNode), cudaMemcpyHostToDevice); + for (int64_t j = 0; j < g.node_size; j++) { + keys.push_back(g.node_list[j].node_id); + offsets.push_back(j); } + build_ps(gpu_id, keys.data(), offsets.data(), keys.size(), 1024, 8, + table_offset); + gpu_graph_fea_list_[offset].node_size = g.node_size; + } else { + build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_fea_list_[offset].node_list = NULL; + gpu_graph_fea_list_[offset].node_size = 0; + } + if (g.feature_size) { + // TODO + cudaError_t cudaStatus = + cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, + g.feature_size * sizeof(uint64_t)); + PADDLE_ENFORCE_EQ( + cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "ailed to allocate memory for graph-feature on gpu ")); + VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t) + << " bytes of memory for graph-feature on gpu " + << resource_->dev_id(gpu_id); + cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, g.feature_list, + g.feature_size * sizeof(uint64_t), cudaMemcpyHostToDevice); + + // TODO + cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, + g.feature_size * sizeof(uint8_t)); + PADDLE_ENFORCE_EQ( + cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "ailed to allocate memory for graph-feature on gpu ")); + VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t) + << " bytes of memory for graph-feature on gpu " + << resource_->dev_id(gpu_id); + cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, g.slot_id_list, + g.feature_size * sizeof(uint8_t), cudaMemcpyHostToDevice); + + gpu_graph_fea_list_[offset].feature_size = g.feature_size; + } else { + gpu_graph_fea_list_[offset].feature_list = NULL; + gpu_graph_fea_list_[offset].slot_id_list = NULL; + gpu_graph_fea_list_[offset].feature_size = 0; } - gpu_graph_list.clear(); } + /* the parameter std::vector cpu_graph_list is generated by cpu. it saves the graph to be saved on each gpu. 
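To make the new indexing easier to follow before the next hunk: the refactor drops the old one-table-per-GPU `tables_` / `gpu_graph_list` vectors in favour of a flat `tables_` array that interleaves edge tables and feature tables per GPU, addressed through `get_table_offset(gpu_id, type, idx)` as declared in graph_gpu_ps_table.h above. Below is a minimal, self-contained sketch of that arithmetic only; the concrete counts (2 GPUs, 3 edge tables, 1 feature table) are made-up example values and none of this code is part of the patch.

// Sketch of the flat table layout implied by get_table_offset().
// Per GPU the block is [edge_0 .. edge_{E-1} | fea_0 .. fea_{F-1}].
#include <cstdio>

enum GraphTableType { EDGE_TABLE, FEATURE_TABLE };

int get_table_offset(int gpu_id, GraphTableType type, int idx,
                     int graph_table_num, int feature_table_num) {
  int type_id = type;  // 0 for edge tables, 1 for feature tables
  return gpu_id * (graph_table_num + feature_table_num) +
         type_id * graph_table_num + idx;
}

int main() {
  const int E = 3, F = 1;  // example graph_table_num_ / feature_table_num_
  for (int gpu = 0; gpu < 2; ++gpu) {
    for (int e = 0; e < E; ++e)
      printf("gpu %d edge table %d -> tables_[%d]\n", gpu, e,
             get_table_offset(gpu, EDGE_TABLE, e, E, F));
    printf("gpu %d feature table 0 -> tables_[%d]\n", gpu,
           get_table_offset(gpu, FEATURE_TABLE, 0, E, F));
  }
  return 0;
}

Note that `gpu_graph_list_` and `gpu_graph_fea_list_` keep their own simpler offsets (`gpu_id * graph_table_num_ + idx` and `gpu_id * feature_table_num_ + idx`); only the hash tables live in the interleaved `tables_` layout above.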
@@ -480,118 +433,170 @@ for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ - -void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { - clear_graph_info(i); +void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i, + int idx) { + clear_graph_info(i, idx); platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); - gpu_graph_list[i] = GpuPsCommGraph(); - sample_status[i] = NULL; - tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_); + int offset = i * graph_table_num_ + idx; + gpu_graph_list_[offset] = GpuPsCommGraph(); + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + size_t capacity = std::max((uint64_t)1, (uint64_t)g.node_size) / load_factor_; + tables_[table_offset] = new Table(capacity); if (g.node_size > 0) { - std::vector keys; - std::vector offset; - cudaMalloc((void**)&gpu_graph_list[i].node_list, + std::vector keys; + std::vector offsets; + cudaMalloc((void**)&gpu_graph_list_[offset].node_list, g.node_size * sizeof(GpuPsGraphNode)); - cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, + cudaMemcpy(gpu_graph_list_[offset].node_list, g.node_list, g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); for (int64_t j = 0; j < g.node_size; j++) { keys.push_back(g.node_list[j].node_id); - offset.push_back(j); + offsets.push_back(j); } - build_ps(i, (uint64_t*)keys.data(), offset.data(), keys.size(), 1024, 8); - gpu_graph_list[i].node_size = g.node_size; + build_ps(i, (uint64_t*)keys.data(), offsets.data(), keys.size(), 1024, 8, + table_offset); + gpu_graph_list_[offset].node_size = g.node_size; } else { - build_ps(i, NULL, NULL, 0, 1024, 8); - gpu_graph_list[i].node_list = NULL; - gpu_graph_list[i].node_size = 0; + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_list_[offset].node_list = NULL; + gpu_graph_list_[offset].node_size = 0; } if (g.neighbor_size) { cudaError_t cudaStatus = - cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - g.neighbor_size * sizeof(int64_t)); + cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list, + g.neighbor_size * sizeof(uint64_t)); PADDLE_ENFORCE_EQ(cudaStatus, cudaSuccess, platform::errors::InvalidArgument( "ailed to allocate memory for graph on gpu ")); - VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(int64_t) + VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(uint64_t) << " bytes of memory for graph-edges on gpu " << resource_->dev_id(i); - cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, - g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); - gpu_graph_list[i].neighbor_size = g.neighbor_size; + cudaMemcpy(gpu_graph_list_[offset].neighbor_list, g.neighbor_list, + g.neighbor_size * sizeof(uint64_t), cudaMemcpyHostToDevice); + gpu_graph_list_[offset].neighbor_size = g.neighbor_size; } else { - gpu_graph_list[i].neighbor_list = NULL; - gpu_graph_list[i].neighbor_size = 0; + gpu_graph_list_[offset].neighbor_list = NULL; + gpu_graph_list_[offset].neighbor_size = 0; } } -void GpuPsGraphTable::init_sample_status() { - for (int i = 0; i < gpu_num; i++) { - if (gpu_graph_list[i].neighbor_size) { - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - int* addr; - cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int)); - cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int)); - 
sample_status[i] = addr; - } - } -} +void GpuPsGraphTable::build_graph_fea_from_cpu( + std::vector& cpu_graph_fea_list, int ntype_id) { + PADDLE_ENFORCE_EQ( + cpu_graph_fea_list.size(), resource_->total_device(), + platform::errors::InvalidArgument("the cpu node list size doesn't match " + "the number of gpu on your machine.")); + clear_feature_info(ntype_id); + for (int i = 0; i < cpu_graph_fea_list.size(); i++) { + int table_offset = + get_table_offset(i, GraphTableType::FEATURE_TABLE, ntype_id); + int offset = i * feature_table_num_ + ntype_id; + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); + tables_[table_offset] = new Table( + std::max((uint64_t)1, (uint64_t)cpu_graph_fea_list[i].node_size) / + load_factor_); + if (cpu_graph_fea_list[i].node_size > 0) { + std::vector keys; + std::vector offsets; + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].node_list, + cpu_graph_fea_list[i].node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_fea_list_[offset].node_list, + cpu_graph_fea_list[i].node_list, + cpu_graph_fea_list[i].node_size * sizeof(GpuPsGraphNode), + cudaMemcpyHostToDevice); + for (int64_t j = 0; j < cpu_graph_fea_list[i].node_size; j++) { + keys.push_back(cpu_graph_fea_list[i].node_list[j].node_id); + offsets.push_back(j); + } + build_ps(i, (uint64_t*)(keys.data()), offsets.data(), keys.size(), 1024, + 8, table_offset); + gpu_graph_fea_list_[offset].node_size = cpu_graph_fea_list[i].node_size; + } else { + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_fea_list_[offset].node_list = NULL; + gpu_graph_fea_list_[offset].node_size = 0; + } + if (cpu_graph_fea_list[i].feature_size) { + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint64_t)); + + cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, + cpu_graph_fea_list[i].feature_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint64_t), + cudaMemcpyHostToDevice); -void GpuPsGraphTable::free_sample_status() { - for (int i = 0; i < gpu_num; i++) { - if (sample_status[i] != NULL) { - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - cudaFree(sample_status[i]); + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint8_t)); + + cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, + cpu_graph_fea_list[i].slot_id_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + gpu_graph_fea_list_[offset].feature_size = + cpu_graph_fea_list[i].feature_size; + } else { + gpu_graph_fea_list_[offset].feature_list = NULL; + gpu_graph_fea_list_[offset].slot_id_list = NULL; + gpu_graph_fea_list_[offset].feature_size = 0; } } + cudaDeviceSynchronize(); } + void GpuPsGraphTable::build_graph_from_cpu( - std::vector& cpu_graph_list) { + std::vector& cpu_graph_list, int idx) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " << cpu_graph_list.size(); PADDLE_ENFORCE_EQ( cpu_graph_list.size(), resource_->total_device(), platform::errors::InvalidArgument("the cpu node list size doesn't match " "the number of gpu on your machine.")); - clear_graph_info(); + clear_graph_info(idx); for (int i = 0; i < cpu_graph_list.size(); i++) { + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + int offset = i * graph_table_num_ + idx; platform::CUDADeviceGuard guard(resource_->dev_id(i)); - gpu_graph_list[i] = GpuPsCommGraph(); - sample_status[i] = NULL; - 
tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) / - load_factor_); + gpu_graph_list_[offset] = GpuPsCommGraph(); + tables_[table_offset] = + new Table(std::max((uint64_t)1, (uint64_t)cpu_graph_list[i].node_size) / + load_factor_); if (cpu_graph_list[i].node_size > 0) { - std::vector keys; - std::vector offset; - cudaMalloc((void**)&gpu_graph_list[i].node_list, + std::vector keys; + std::vector offsets; + cudaMalloc((void**)&gpu_graph_list_[offset].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); - cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, + cudaMemcpy(gpu_graph_list_[offset].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); - offset.push_back(j); + offsets.push_back(j); } - build_ps(i, (uint64_t*)(keys.data()), offset.data(), keys.size(), 1024, - 8); - gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; + build_ps(i, (uint64_t*)(keys.data()), offsets.data(), keys.size(), 1024, + 8, table_offset); + gpu_graph_list_[offset].node_size = cpu_graph_list[i].node_size; } else { - build_ps(i, NULL, NULL, 0, 1024, 8); - gpu_graph_list[i].node_list = NULL; - gpu_graph_list[i].node_size = 0; + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_list_[offset].node_list = NULL; + gpu_graph_list_[offset].node_size = 0; } if (cpu_graph_list[i].neighbor_size) { - cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(uint64_t)); - cudaMemcpy(gpu_graph_list[i].neighbor_list, + cudaMemcpy(gpu_graph_list_[offset].neighbor_list, cpu_graph_list[i].neighbor_list, - cpu_graph_list[i].neighbor_size * sizeof(int64_t), + cpu_graph_list[i].neighbor_size * sizeof(uint64_t), cudaMemcpyHostToDevice); - gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; + gpu_graph_list_[offset].neighbor_size = cpu_graph_list[i].neighbor_size; } else { - gpu_graph_list[i].neighbor_list = NULL; - gpu_graph_list[i].neighbor_size = 0; + gpu_graph_list_[offset].neighbor_list = NULL; + gpu_graph_list_[offset].neighbor_size = 0; } } cudaDeviceSynchronize(); @@ -599,174 +604,21 @@ void GpuPsGraphTable::build_graph_from_cpu( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { - return graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.key, - q.sample_size, q.len, cpu_switch); + return graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.table_idx, + q.src_nodes, q.sample_size, q.len, + cpu_switch); } + NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, - int64_t* key, + uint64_t* key, int sample_size, int len) { - /* - comment 2 - this function shares some kernels with heter_comm_inl.h - arguments definitions: - gpu_id:the id of gpu. - len:how many keys are used,(the length of array key) - sample_size:how many neighbors should be sampled for each node in key. - the code below shuffle the key array to make the keys - that belong to a gpu-card stay together, - the shuffled result is saved on d_shard_keys, - if ith element in d_shard_keys_ptr is - from jth element in the original key array, then idx[i] = j, - idx could be used to recover the original array. 
- if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = - b, - if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 - for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 - when we run this neighbor_sample function, - the key is shuffled to [0,2,4,6,8,1,3,5,7] - the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, - the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, - h_left = [0,5],h_right = [4,8] - */ - - NeighborSampleResult result; - result.initialize(sample_size, len, resource_->dev_id(gpu_id)); - if (len == 0) { - return result; - } - platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - int* actual_sample_size = result.actual_sample_size; - int64_t* val = result.val; - int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); - - int grid_size = (len - 1) / block_size_ + 1; - - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); - int* d_shard_actual_sample_size_ptr = - reinterpret_cast(d_shard_actual_sample_size->ptr()); - - split_input_to_shard((uint64_t*)(key), d_idx_ptr, len, d_left_ptr, - d_right_ptr, gpu_id); - - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, len, - stream); - cudaStreamSynchronize(stream); - - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - // auto start1 = std::chrono::steady_clock::now(); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - /* - comment 3 - shard_len denotes the size of keys on i-th gpu here, - when we sample on i-th gpu, we allocate shard_len * (1 + sample_size) - int64_t units - of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved - for the respective nodes' indexes - and acutal sample_size. - with nodes' indexes we could get the nodes to sample. - since size of int64_t is 8 bits, while size of int is 4, - the range of [0,shard_len) contains shard_len * 2 int uinits; - The values of the first half of this range will be updated by - the k-v map on i-th-gpu. - The second half of this range is saved for actual sample size of each node. 
- For node x, - its sampling result is saved on the range - [shard_len + sample_size * x,shard_len + sample_size * x + - actual_sample_size_of_x) - of alloc_mem_i, actual_sample_size_of_x equals ((int - *)alloc_mem_i)[shard_len + x] - */ - - create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t) + - sizeof(int) * (shard_len + shard_len % 2)); - // auto& node = path_[gpu_id][i].nodes_[0]; - } - walk_to_dest(gpu_id, total_gpu, h_left, h_right, - (uint64_t*)(d_shard_keys_ptr), NULL); - - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1) { - continue; - } - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - auto& node = path_[gpu_id][i].nodes_.back(); - cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), - node.in_stream); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, gpu_id)); - // node.in_stream); - auto graph = gpu_graph_list[i]; - int64_t* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = - (int64_t*)(actual_size_array + shard_len + shard_len % 2); - int sample_grid_size = (shard_len - 1) / dim_y + 1; - dim3 block(parallel_sample_size, dim_y); - dim3 grid(sample_grid_size); - neighbor_sample_example<<remote_stream(i, gpu_id)>>>( - graph, id_array, actual_size_array, sample_array, sample_size, - sample_status[i], shard_len, gpu_id); - } - - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1) { - continue; - } - cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); - } - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, - h_left, h_right, d_shard_vals_ptr, - d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( - d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, - d_idx_ptr, sample_size, len); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } - cudaStreamSynchronize(stream); - return result; + return graph_neighbor_sample_v2(gpu_id, 0, key, sample_size, len, false); } NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( - int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) { + int gpu_id, int idx, uint64_t* key, int sample_size, int len, + bool cpu_query_switch) { NeighborSampleResult result; result.initialize(sample_size, len, resource_->dev_id(gpu_id)); @@ -777,7 +629,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); int* actual_sample_size = result.actual_sample_size; - int64_t* val = result.val; + uint64_t* val = result.val; int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); @@ -801,10 +653,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = + memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -826,8 +679,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( if (shard_len == 0) { continue; } - create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t) + + create_storage(gpu_id, i, shard_len * sizeof(uint64_t), + shard_len * sample_size * sizeof(uint64_t) + + shard_len * sizeof(int64_t) + sizeof(int) * (shard_len + shard_len % 2)); } walk_to_dest(gpu_id, total_gpu, h_left, h_right, @@ -844,22 +698,24 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( cudaStreamSynchronize(node.in_stream); platform::CUDADeviceGuard guard(resource_->dev_id(i)); // If not found, val is -1. 
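      // Layout of node.val_storage for this shard, matching create_storage()
      // above and the pointer arithmetic just below: first shard_len int64_t
      // node indexes written by the hash-table get (-1 when the key is not
      // found, per the memset), then shard_len ints of actual sample sizes
      // padded to an even count, then shard_len * sample_size uint64_t
      // sampled neighbor ids filled by neighbor_sample_kernel.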
- tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, gpu_id)); - - auto graph = gpu_graph_list[i]; + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + int offset = i * graph_table_num_ + idx; + tables_[table_offset]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + + auto graph = gpu_graph_list_[offset]; int64_t* id_array = reinterpret_cast(node.val_storage); int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = - (int64_t*)(actual_size_array + shard_len + shard_len % 2); + uint64_t* sample_array = + (uint64_t*)(actual_size_array + shard_len + shard_len % 2); constexpr int WARP_SIZE = 32; constexpr int BLOCK_WARPS = 128 / WARP_SIZE; constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE); - neighbor_sample_example_v2< + neighbor_sample_kernel< WARP_SIZE, BLOCK_WARPS, TILE_SIZE><<remote_stream(i, gpu_id)>>>( graph, id_array, actual_size_array, sample_array, sample_size, @@ -872,8 +728,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( } cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); } - - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + move_result_to_source_gpu(gpu_id, total_gpu, sample_size, h_left, h_right, d_shard_vals_ptr, d_shard_actual_sample_size_ptr); fill_dvalues<<>>( @@ -884,7 +739,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( if (cpu_query_switch) { // Get cpu keys and corresponding position. - thrust::device_vector t_cpu_keys(len); + thrust::device_vector t_cpu_keys(len); thrust::device_vector t_index(len + 1, 0); get_cpu_id_index<<>>( key, actual_sample_size, thrust::raw_pointer_cast(t_cpu_keys.data()), @@ -897,34 +752,34 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( cudaMemcpy(&number_on_cpu, thrust::raw_pointer_cast(t_index.data()), sizeof(int), cudaMemcpyDeviceToHost); if (number_on_cpu > 0) { - int64_t* cpu_keys = new int64_t[number_on_cpu]; + uint64_t* cpu_keys = new uint64_t[number_on_cpu]; cudaMemcpy(cpu_keys, thrust::raw_pointer_cast(t_cpu_keys.data()), - number_on_cpu * sizeof(int64_t), cudaMemcpyDeviceToHost); + number_on_cpu * sizeof(uint64_t), cudaMemcpyDeviceToHost); std::vector> buffers(number_on_cpu); std::vector ac(number_on_cpu); - auto status = cpu_graph_table->random_sample_neighbors( - 0, cpu_keys, sample_size, buffers, ac, false); + auto status = cpu_graph_table_->random_sample_neighbors( + idx, cpu_keys, sample_size, buffers, ac, false); int total_cpu_sample_size = std::accumulate(ac.begin(), ac.end(), 0); - total_cpu_sample_size /= sizeof(int64_t); + total_cpu_sample_size /= sizeof(uint64_t); - // Merge buffers into one int64_t vector. - int64_t* merge_buffers = new int64_t[total_cpu_sample_size]; + // Merge buffers into one uint64_t vector. + uint64_t* merge_buffers = new uint64_t[total_cpu_sample_size]; int start = 0; for (int j = 0; j < number_on_cpu; j++) { - memcpy(merge_buffers + start, (int64_t*)(buffers[j].get()), ac[j]); - start += ac[j] / sizeof(int64_t); + memcpy(merge_buffers + start, (uint64_t*)(buffers[j].get()), ac[j]); + start += ac[j] / sizeof(uint64_t); } // Copy merge_buffers to gpu. 
- thrust::device_vector gpu_buffers(total_cpu_sample_size); + thrust::device_vector gpu_buffers(total_cpu_sample_size); thrust::device_vector gpu_ac(number_on_cpu); - int64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data()); + uint64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data()); int* gpu_ac_ptr = thrust::raw_pointer_cast(gpu_ac.data()); cudaMemcpyAsync(gpu_buffers_ptr, merge_buffers, - total_cpu_sample_size * sizeof(int64_t), + total_cpu_sample_size * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(gpu_ac_ptr, ac.data(), number_on_cpu * sizeof(int), cudaMemcpyHostToDevice, stream); @@ -970,8 +825,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( t_actual_sample_size.end()); result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(int64_t)); - result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); + memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); + result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); @@ -1001,7 +856,7 @@ NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id, return NodeQueryResult(); } -NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, +NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int idx, int start, int query_size) { NodeQueryResult result; if (query_size <= 0) return result; @@ -1009,24 +864,8 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, actual_size = 0; // int dev_id = resource_->dev_id(gpu_id); // platform::CUDADeviceGuard guard(dev_id); - std::vector idx, gpu_begin_pos, local_begin_pos; + std::vector gpu_begin_pos, local_begin_pos; int sample_size; - /* - if idx[i] = a, gpu_begin_pos[i] = p1, - gpu_local_begin_pos[i] = p2; - sample_size[i] = s; - then on gpu a, the nodes of positions [p1,p1 + s) should be returned - and saved from the p2 position on the sample_result array - for example: - suppose - gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] - start = 3, query_size = 5 - we know [6,8,1,3,5] should be returned; - idx = [0,1] - gpu_begin_pos = [3,0] - local_begin_pos = [0,3] - sample_size = [2,3] - */ std::function range_check = []( int x, int y, int x1, int y1, int& x2, int& y2) { if (y <= x1 || x >= y1) return 0; @@ -1034,7 +873,7 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, x2 = max(x1, x); return y2 - x2; }; - auto graph = gpu_graph_list[gpu_id]; + auto graph = gpu_graph_list_[gpu_id]; if (graph.node_size == 0) { return result; } @@ -1044,68 +883,128 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, if (len == 0) { return result; } - int64_t* val; + uint64_t* val; sample_size = len; result.initialize(len, resource_->dev_id(gpu_id)); actual_size = len; val = result.val; int dev_id_i = resource_->dev_id(gpu_id); platform::CUDADeviceGuard guard(dev_id_i); - // platform::CUDADeviceGuard guard(i); int grid_size = (len - 1) / block_size_ + 1; + int offset = gpu_id * graph_table_num_ + idx; node_query_example<<remote_stream(gpu_id, gpu_id)>>>( - gpu_graph_list[gpu_id], x2, len, (int64_t*)val); + gpu_graph_list_[offset], x2, len, (uint64_t*)val); cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)); return result; - /* - for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { - auto graph = gpu_graph_list[i]; - if (graph.node_size == 0) { +} + +int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, std::shared_ptr d_nodes, + std::shared_ptr 
d_feature, int node_num, int slot_num) { + if (node_num == 0) { + return -1; + } + + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + int total_gpu = resource_->total_device(); + auto stream = resource_->local_stream(gpu_id, 0); + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, slot_num * node_num * sizeof(uint64_t)); + uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); + + uint64_t* key = (uint64_t*)d_nodes->ptr(); + split_input_to_shard((uint64_t*)(key), d_idx_ptr, node_num, d_left_ptr, d_right_ptr, gpu_id); + + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, node_num, stream); + cudaStreamSynchronize(stream); + + int h_left[total_gpu]; // NOLINT + cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + int h_right[total_gpu]; // NOLINT + cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { continue; } - int x2, y2; - int len = range_check(start, start + query_size, size, - size + graph.node_size, x2, y2); - if (len > 0) { - idx.push_back(i); - gpu_begin_pos.emplace_back(x2 - size); - local_begin_pos.emplace_back(actual_size); - sample_size.push_back(len); - actual_size += len; - create_storage(gpu_id, i, 1, len * sizeof(int64_t)); - } - size += graph.node_size; - } - for (int i = 0; i < idx.size(); i++) { - int dev_id_i = resource_->dev_id(idx[i]); - platform::CUDADeviceGuard guard(dev_id_i); - // platform::CUDADeviceGuard guard(i); - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - int grid_size = (sample_size[i] - 1) / block_size_ + 1; - node_query_example<<remote_stream(idx[i], gpu_id)>>>( - gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], - (int64_t*)node.val_storage); + create_storage(gpu_id, i, shard_len * sizeof(uint64_t), + shard_len * slot_num * sizeof(uint64_t) + shard_len * sizeof(int64_t) + + sizeof(int) * (shard_len + shard_len % 2)); } - for (int i = 0; i < idx.size(); i++) { - cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), - node.val_storage, node.val_bytes_len, cudaMemcpyDefault, - node.out_stream); + walk_to_dest(gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + auto& node = path_[gpu_id][i].nodes_.back(); + cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // If not found, val is -1. + int table_offset = get_table_offset(i, GraphTableType::FEATURE_TABLE, 0); + tables_[table_offset]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + + int offset = i * feature_table_num_; + auto graph = gpu_graph_fea_list_[offset]; + int64_t* val_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(val_array + shard_len); + uint64_t* feature_array = (uint64_t*)(actual_size_array + shard_len + shard_len % 2); + dim3 grid((shard_len - 1) / dim_y + 1); + dim3 block(1, dim_y); + get_features_kernel<<remote_stream(i, gpu_id)>>>( + graph, val_array, actual_size_array, feature_array, slot_num, shard_len); } - for (int i = 0; i < idx.size(); i++) { - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - cudaStreamSynchronize(node.out_stream); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); } - for (auto x : idx) { - destroy_storage(gpu_id, x); + + move_result_to_source_gpu(gpu_id, total_gpu, slot_num, h_left, h_right, + d_shard_vals_ptr, d_shard_actual_size_ptr); + + int grid_size = (node_num - 1) / block_size_ + 1; + uint64_t* result = (uint64_t*)d_feature->ptr(); + fill_dvalues<<>>(d_shard_vals_ptr, result, + d_shard_actual_size_ptr, d_idx_ptr, slot_num, node_num); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); } - return result; - */ + + cudaStreamSynchronize(stream); + + return 0; } + } }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index c976bb67cb21e1..4a4b9929370910 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" namespace paddle { namespace framework { @@ -25,11 +27,26 @@ void GraphGpuWrapper::set_device(std::vector ids) { device_id_mapping.push_back(device_id); } } -std::vector> GraphGpuWrapper::get_all_id(int type, int idx, + +std::vector> GraphGpuWrapper::get_all_id(int type, int slice_num) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_all_id(type, idx, slice_num); + ->cpu_graph_table_->get_all_id(type, slice_num); } + +std::vector> GraphGpuWrapper::get_all_id(int type, + int idx, + int slice_num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->get_all_id(type, idx, slice_num); +} + +int GraphGpuWrapper::get_all_feature_ids(int type, int idx, int slice_num, + std::vector>* output) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->get_all_feature_ids(type, idx, slice_num, output); +} + void GraphGpuWrapper::set_up_types(std::vector &edge_types, std::vector &node_types) { id_to_edge = edge_types; @@ -48,31 +65,39 @@ void GraphGpuWrapper::set_up_types(std::vector &edge_types, this->table_feat_conf_feat_shape.resize(node_types.size()); } +void GraphGpuWrapper::set_feature_separator(std::string ch) { + feature_separator_ = ch; + if (graph_table != nullptr) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->set_feature_separator(feature_separator_); + } +} + void GraphGpuWrapper::make_partitions(int idx, int64_t byte_size, int device_len) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->make_partitions(idx, byte_size, device_len); + ->cpu_graph_table_->make_partitions(idx, byte_size, device_len); } int32_t GraphGpuWrapper::load_next_partition(int idx) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->load_next_partition(idx); + ->cpu_graph_table_->load_next_partition(idx); } void GraphGpuWrapper::set_search_level(int level) { - ((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level); + ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->set_search_level(level); } -std::vector GraphGpuWrapper::get_partition(int idx, int num) { +std::vector GraphGpuWrapper::get_partition(int idx, int num) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_partition(idx, num); + ->cpu_graph_table_->get_partition(idx, num); } int32_t GraphGpuWrapper::get_partition_num(int idx) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_partition_num(idx); + ->cpu_graph_table_->get_partition_num(idx); } void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->make_complementary_graph(idx, byte_size); + ->cpu_graph_table_->make_complementary_graph(idx, byte_size); } void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, bool reverse) { @@ -87,7 +112,7 @@ void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, } if (edge_to_id.find(name) != edge_to_id.end()) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->Load(std::string(filepath), params); + ->cpu_graph_table_->Load(std::string(filepath), params); } } @@ -98,7 +123,7 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { if (feature_to_id.find(name) != feature_to_id.end()) { ((GpuPsGraphTable *)graph_table) - 
->cpu_graph_table->Load(std::string(filepath), params); + ->cpu_graph_table_->Load(std::string(filepath), params); } } @@ -134,8 +159,9 @@ void GraphGpuWrapper::init_search_level(int level) { search_level = level; } void GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); + table_proto.set_shard_num(1000); table_proto.set_search_level(search_level); - table_proto.set_table_name("cpu_graph_table"); + table_proto.set_table_name("cpu_graph_table_"); table_proto.set_use_cache(false); for (int i = 0; i < id_to_edge.size(); i++) table_proto.add_edge_types(id_to_edge[i]); @@ -152,74 +178,95 @@ void GraphGpuWrapper::init_service() { std::shared_ptr resource = std::make_shared(device_id_mapping); resource->enable_p2p(); - GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1); + GpuPsGraphTable *g = + new GpuPsGraphTable(resource, 1, id_to_edge.size()); g->init_cpu_table(table_proto); + g->cpu_graph_table_->set_feature_separator(feature_separator_); graph_table = (char *)g; } void GraphGpuWrapper::upload_batch(int idx, - std::vector> &ids) { + std::vector> &ids) { + debug_gpu_memory_info("upload_batch node start"); GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; - // std::vector vec; for (int i = 0; i < ids.size(); i++) { - // vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); GpuPsCommGraph sub_graph = - g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]); - g->build_graph_on_single_gpu(sub_graph, i); + g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]); + // sub_graph.display_on_cpu(); + g->build_graph_on_single_gpu(sub_graph, i, idx); sub_graph.release_on_cpu(); VLOG(0) << "sub graph on gpu " << i << " is built"; } - // g->build_graph_from_cpu(vec); + debug_gpu_memory_info("upload_batch node end"); } -// void GraphGpuWrapper::test() { -// int64_t cpu_key[3] = {0, 1, 2}; -// void *key; -// platform::CUDADeviceGuard guard(0); -// cudaMalloc((void **)&key, 3 * sizeof(int64_t)); -// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); -// auto neighbor_sample_res = -// ((GpuPsGraphTable *)graph_table) -// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); -// int64_t *res = new int64_t[7]; -// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), -// cudaMemcpyDeviceToHost); -// int *actual_sample_size = new int[3]; -// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, -// 3 * sizeof(int), -// cudaMemcpyDeviceToHost); // 3, 1, 3 +// feature table +void GraphGpuWrapper::upload_batch(std::vector> &node_ids, + int slot_num) { + debug_gpu_memory_info("upload_batch feature start"); + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + for (int i = 0; i < node_ids.size(); i++) { + VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size[" + << node_ids[i].size() << "]"; + GpuPsCommGraphFea sub_graph = g->cpu_graph_table_->make_gpu_ps_graph_fea( + node_ids[i], slot_num); + + // sub_graph.display_on_cpu(); + VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i + << "]_size[" << node_ids[i].size() << "]"; + g->build_graph_fea_on_single_gpu(sub_graph, i); + + sub_graph.release_on_cpu(); + + VLOG(0) << "sub graph fea on gpu " << i << " is built"; + } + // g->build_graph_from_cpu(vec); + debug_gpu_memory_info("upload_batch feature end"); +} -// //{0,9} or {9,0} is expected for key 0 -// //{0,2} or {2,0} is expected for key 1 -// //{1,3} or {3,1} is expected for key 2 -// for (int i = 0; i < 3; i++) { -// VLOG(0) << "actual sample size for " << i << " is " -// << actual_sample_size[i]; -// for (int j 
= 0; j < actual_sample_size[i]; j++) { -// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + -// j]; -// } -// } -// } NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { return ((GpuPsGraphTable *)graph_table) ->graph_neighbor_sample_v3(q, cpu_switch); } +int GraphGpuWrapper::get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, uint32_t size, int slot_num) const { + platform::CUDADeviceGuard guard(gpu_id); + PADDLE_ENFORCE_NOT_NULL(graph_table); + return ((GpuPsGraphTable *)graph_table) + ->get_feature_of_nodes(gpu_id, d_walk, d_offset, size, slot_num); +} + +NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample( + int gpu_id, uint64_t *device_keys, int walk_degree, int len) { + platform::CUDADeviceGuard guard(gpu_id); + auto neighbor_sample_res = + ((GpuPsGraphTable *)graph_table) + ->graph_neighbor_sample(gpu_id, device_keys, walk_degree, len); + + return neighbor_sample_res; +} + // this function is contributed by Liwb5 -std::vector GraphGpuWrapper::graph_neighbor_sample( - int gpu_id, std::vector &key, int sample_size) { - int64_t *cuda_key; +std::vector GraphGpuWrapper::graph_neighbor_sample( + int gpu_id, int idx, std::vector &key, int sample_size) { + std::vector res; + if (key.size() == 0) { + return res; + } + uint64_t *cuda_key; platform::CUDADeviceGuard guard(gpu_id); - cudaMalloc(&cuda_key, key.size() * sizeof(int64_t)); - cudaMemcpy(cuda_key, key.data(), key.size() * sizeof(int64_t), + cudaMalloc(&cuda_key, key.size() * sizeof(uint64_t)); + cudaMemcpy(cuda_key, key.data(), key.size() * sizeof(uint64_t), cudaMemcpyHostToDevice); - + VLOG(0) << "key_size: " << key.size(); auto neighbor_sample_res = ((GpuPsGraphTable *)graph_table) - ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); + ->graph_neighbor_sample_v2(gpu_id, idx, cuda_key, sample_size, + key.size(), false); int *actual_sample_size = new int[key.size()]; cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, key.size() * sizeof(int), @@ -229,11 +276,11 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( cumsum += actual_sample_size[i]; } - std::vector cpu_key, res; + std::vector cpu_key; cpu_key.resize(key.size() * sample_size); cudaMemcpy(cpu_key.data(), neighbor_sample_res.val, - key.size() * sample_size * sizeof(int64_t), + key.size() * sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); for (int i = 0; i < key.size(); i++) { for (int j = 0; j < actual_sample_size[i]; j++) { @@ -249,26 +296,19 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( return res; } -void GraphGpuWrapper::init_sample_status() { - ((GpuPsGraphTable *)graph_table)->init_sample_status(); -} - -void GraphGpuWrapper::free_sample_status() { - ((GpuPsGraphTable *)graph_table)->free_sample_status(); -} -NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start, +NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int idx, int start, int query_size) { return ((GpuPsGraphTable *)graph_table) - ->query_node_list(gpu_id, start, query_size); + ->query_node_list(gpu_id, idx, start, query_size); } void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->load_node_weight(type_id, idx, path); + ->cpu_graph_table_->load_node_weight(type_id, idx, path); } void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) - 
->cpu_graph_table->export_partition_files(idx, file_path); + ->cpu_graph_table_->export_partition_files(idx, file_path); } #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index a34e752fc7ea7d..7de234a8703169 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -31,12 +31,13 @@ class GraphGpuWrapper { } static std::shared_ptr s_instance_; void initialize(); - void test(); void set_device(std::vector ids); void init_service(); void set_up_types(std::vector& edge_type, std::vector& node_type); - void upload_batch(int idx, std::vector>& ids); + void upload_batch(int etype_id, std::vector>& ids); + void upload_batch(std::vector>& ids, + int slot_num); void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); @@ -45,22 +46,30 @@ class GraphGpuWrapper { int32_t get_partition_num(int idx); void load_node_weight(int type_id, int idx, std::string path); void export_partition_files(int idx, std::string file_path); - std::vector get_partition(int idx, int num); + std::vector get_partition(int idx, int num); void make_partitions(int idx, int64_t byte_size, int device_len); void make_complementary_graph(int idx, int64_t byte_size); void set_search_level(int level); void init_search_level(int level); - std::vector> get_all_id(int type, int idx, - int slice_num); - NodeQueryResult query_node_list(int gpu_id, int start, int query_size); + std::vector> get_all_id(int type, int slice_num); + std::vector> get_all_id(int type, int idx, + int slice_num); + int get_all_feature_ids(int type, int idx, int slice_num, + std::vector>* output); + NodeQueryResult query_node_list(int gpu_id, int idx, int start, + int query_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); - std::vector graph_neighbor_sample(int gpu_id, - std::vector& key, - int sample_size); + NeighborSampleResult graph_neighbor_sample(int gpu_id, uint64_t* device_keys, + int walk_degree, int len); + std::vector graph_neighbor_sample(int gpu_id, int idx, + std::vector& key, + int sample_size); + void set_feature_separator(std::string ch); + int get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, uint32_t size, int slot_num) const; - void init_sample_status(); - void free_sample_status(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -71,6 +80,7 @@ class GraphGpuWrapper { std::vector device_id_mapping; int search_level = 1; void* graph_table; + std::string feature_separator_ = std::string(" "); }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h index a7c043f1edf375..335508217fb04a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h @@ -81,10 +81,10 @@ class CommonGraphSampler : public GraphSampler { virtual void init(GpuPsGraphTable *g, std::vector args); GpuPsGraphTable *gpu_table; paddle::distributed::GraphTable *table; - std::vector gpu_edges_count; - int64_t cpu_edges_count; - int64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; - std::vector> gpu_set; + std::vector gpu_edges_count; + uint64_t cpu_edges_count; + uint64_t 
gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; + std::vector> gpu_set; int gpu_num; }; @@ -101,7 +101,7 @@ class AllInGpuGraphSampler : public GraphSampler { paddle::distributed::GraphTable *graph_table; GpuPsGraphTable *gpu_table; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; std::vector sample_res; // std::shared_ptr random; int gpu_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h index ad4b00b11aa39f..ae05398c148444 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h @@ -24,13 +24,14 @@ int CommonGraphSampler::load_from_ssd(std::string path) { std::cout << values.size(); if (values.size() < 2) continue; auto neighbors = paddle::string::split_string(values[1], ";"); - std::vector neighbor_data; + std::vector neighbor_data; for (auto x : neighbors) { neighbor_data.push_back(std::stoll(x)); } auto src_id = std::stoll(values[0]); - _db->put(0, (char *)&src_id, sizeof(uint64_t), (char *)neighbor_data.data(), - sizeof(int64_t) * neighbor_data.size()); + _db->put(0, (char *)&src_id, sizeof(uint64_t), + (char *)neighbor_data.data(), + sizeof(uint64_t) * neighbor_data.size()); int gpu_shard = src_id % gpu_num; if (gpu_edges_count[gpu_shard] + neighbor_data.size() <= gpu_edges_each_limit) { @@ -49,7 +50,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) { } std::vector graph_list; for (int i = 0; i < gpu_num; i++) { - std::vector ids(gpu_set[i].begin(), gpu_set[i].end()); + std::vector ids(gpu_set[i].begin(), gpu_set[i].end()); graph_list.push_back(table->make_gpu_ps_graph(ids)); } gpu_table->build_graph_from_cpu(graph_list); @@ -69,9 +70,9 @@ void CommonGraphSampler::init(GpuPsGraphTable *g, gpu_edges_each_limit = gpu_edges_limit / gpu_num; if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX; table = g->cpu_graph_table.get(); - gpu_edges_count = std::vector(gpu_num, 0); + gpu_edges_count = std::vector(gpu_num, 0); cpu_edges_count = 0; - gpu_set = std::vector>(gpu_num); + gpu_set = std::vector>(gpu_num); } int AllInGpuGraphSampler::run_graph_sampling() { return 0; } @@ -85,7 +86,7 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) { sample_res.resize(gpu_num); std::vector>> sample_nodes_ex(graph_table->task_pool_size_); - std::vector>> sample_neighbors_ex( + std::vector>> sample_neighbors_ex( graph_table->task_pool_size_); for (int i = 0; i < graph_table->task_pool_size_; i++) { sample_nodes_ex[i].resize(gpu_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e2f362d4074589..234aa15ebf74d1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -118,8 +118,8 @@ class HashTable { StreamType stream); template - void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - StreamType stream); + void insert(const KeyType* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, StreamType stream); template void get(const KeyType* d_keys, ValType* d_vals, size_t len, diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 5edc218796ef8a..81da79b768218f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
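// A host-side sketch of the pool addressing that the insert_kernel change below
// relies on: each key's value lives at pool + (start_index + i) * feature_value_size
// instead of a hard-coded 80-byte stride. The helper name value_ptr_in_pool is
// illustrative, not part of the patch.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

inline char* value_ptr_in_pool(char* pool, size_t feature_value_size,
                               size_t start_index, size_t i) {
  // 64-bit arithmetic so large pools do not overflow a 32-bit offset.
  uint64_t offset = static_cast<uint64_t>(start_index + i) * feature_value_size;
  return pool + offset;
}

int main() {
  const size_t feature_value_size = 112;  // e.g. an 8-byte-aligned FeatureValue footprint
  std::vector<char> pool(1000 * feature_value_size);
  // The 5th key of a chunk that starts at pool slot 200 maps to slot 205.
  char* v = value_ptr_in_pool(pool.data(), feature_value_size, 200, 5);
  std::cout << "slot offset = " << (v - pool.data()) / feature_value_size
            << std::endl;
  return 0;
}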
@@ -50,7 +50,8 @@ __global__ void insert_kernel(Table* table, template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, - size_t len, char* pool, int start_index) { + size_t len, char* pool, size_t feature_value_size, + int start_index) { ReplaceOp op; thrust::pair kv; @@ -58,7 +59,8 @@ __global__ void insert_kernel(Table* table, if (i < len) { kv.first = keys[i]; - kv.second = (Table::mapped_type)(pool + (start_index + i) * 80); + uint64_t offset = uint64_t(start_index + i) * feature_value_size; + kv.second = (Table::mapped_type)(pool + offset); auto it = table->insert(kv, op); assert(it != table->end() && "error: insert fails: table is full"); } @@ -81,14 +83,29 @@ __global__ void search_kernel(Table* table, template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, - char* const vals, size_t len, + char* vals, size_t len, size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // return; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - *(FeatureValue*)(vals + i * pull_feature_value_size) = *(it->second); + uint64_t offset = i * pull_feature_value_size; + FeatureValue* cur = (FeatureValue*)(vals + offset); + FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } } } } @@ -121,7 +138,7 @@ __global__ void dy_mf_update_kernel(Table* table, FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { - printf("yxf::push miss key: %d", keys[i]); + printf("warning: push miss key: %d", keys[i]); } } } @@ -201,7 +218,8 @@ void HashTable::insert(const KeyType* d_keys, template template void HashTable::insert(const KeyType* d_keys, size_t len, - char* pool, size_t start_index, + char* pool, size_t feature_value_size, + size_t start_index, StreamType stream) { if (len == 0) { return; @@ -210,8 +228,8 @@ void HashTable::insert(const KeyType* d_keys, size_t len, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - insert_kernel<<>>(container_, d_keys, len, - pool, start_index); + insert_kernel<<>>( + container_, d_keys, len, pool, feature_value_size, start_index); } template @@ -319,9 +337,12 @@ void HashTable::update(const KeyType* d_keys, } template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -331,12 +352,18 @@ template void HashTable::get< paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); + template void HashTable::get(const long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t 
len, cudaStream_t stream); template void HashTable::get(const long* d_keys, @@ -354,6 +381,11 @@ template void HashTable::insert< const paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void HashTable:: + insert(const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, + cudaStream_t stream); + template void HashTable::insert(const long* d_keys, const int* d_vals, size_t len, @@ -366,6 +398,11 @@ template void HashTable::insert(const long* d_keys, template void HashTable::insert( const unsigned long* d_keys, const int* d_vals, size_t len, cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, const long* d_vals, size_t len, + cudaStream_t stream); + template void HashTable::insert( const long* d_keys, const unsigned long* d_vals, size_t len, cudaStream_t stream); @@ -374,11 +411,6 @@ template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); -// template void HashTable::insert< -// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, -// size_t start_index, cudaStream_t stream); - template void HashTable:: dump_to_cpu(int devid, cudaStream_t stream); @@ -393,6 +425,16 @@ template void HashTable::update< sgd, cudaStream_t stream); +template void +HashTable::update< + Optimizer, + cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len, + Optimizer + sgd, + cudaStream_t stream); + // template void HashTable::update< // Optimizer #include +#include "cub/cub.cuh" +#include "cub/util_allocator.cuh" #if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/timer.h" #include "thrust/pair.h" #elif defined(PADDLE_WITH_XPU_KP) // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" @@ -38,6 +41,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + template class HeterComm { public: @@ -50,9 +56,13 @@ class HeterComm { int* left, int* right, int gpu_num); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len); // NOLINT + void dynamic_merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, + size_t len, int& uniq_len); void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num); + size_t chunk_size, int stream_num, int offset = -1); + void build_ps(int num, KeyType* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, int stream_num); void dump(); void show_one_table(int gpu_num); int get_index_by_devid(int devid); @@ -96,6 +106,11 @@ class HeterComm { nccl_inter_comms_ = inter_comms; node_size_ = comm_size; } + + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + } #endif bool need_transfer(int send_id, int receive_id) { @@ -114,8 +129,8 @@ class HeterComm { char* key_storage; char* val_storage; int sync; - int key_bytes_len; - int val_bytes_len; + size_t key_bytes_len; + size_t val_bytes_len; int dev_num; }; @@ -202,16 +217,22 @@ class HeterComm { #endif } - void create_storage(int start_index, int end_index, int keylen, int vallen); + void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); void destroy_storage(int start_index, int end_index); void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, GradType* src_val); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, char* src_val, size_t val_size); void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + char* src_val, size_t val_size); protected: using Table = HashTable; + using PtrTable = HashTable; std::vector tables_; + std::vector ptr_tables_; std::shared_ptr resource_; std::vector> path_; float load_factor_{0.75}; @@ -221,6 +242,7 @@ class HeterComm { private: int topo_aware_{0}; std::vector storage_; + DynamicGradMerger merger_; int feanum_{1800 * 2048}; int multi_node_{0}; int node_size_; @@ -228,6 +250,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; + int multi_mf_dim_{8}; + int max_mf_dim_ = 8; std::vector> allocators_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index d23719ea9eb774..9229076e7fd7ff 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_XPU_KP @@ -22,20 +23,32 @@ limitations under the License. 
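// A worked example of the TYPEALIGN macro introduced above: it rounds LEN up to
// the next multiple of ALIGNVAL (a power of two), which is how the variable
// value and gradient footprints below are padded to 8 bytes. DemoFeatureValue is
// a hypothetical stand-in used only for sizing; the real FeatureValue lives in
// feature_value.h.
#include <cstdint>
#include <iostream>

#define TYPEALIGN(ALIGNVAL, LEN) \
  (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

struct DemoFeatureValue {  // hypothetical, for illustration only
  float show, clk, lr, lr_g2sum;
  int slot, mf_dim, mf_size;
  uint64_t cpu_ptr;
  float mf[1];  // dynamically sized tail, like FeatureValue::mf
};

int main() {
  int max_mf_dim = 9;
  uint64_t raw = sizeof(DemoFeatureValue) + sizeof(float) * (max_mf_dim + 1);
  std::cout << raw << " bytes rounds up to " << TYPEALIGN(8, raw) << std::endl;
  std::cout << TYPEALIGN(8, 80) << " " << TYPEALIGN(8, 81) << std::endl;  // 80 88
  return 0;
}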
*/ namespace paddle { namespace framework { - template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { + VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_device()); + multi_mf_dim_ = resource->multi_mf(); for (int i = 0; i < resource_->total_device(); ++i) { #if defined(PADDLE_WITH_CUDA) platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); + if (!multi_mf_dim_) { + auto table = new Table(capacity / load_factor_); + tables_.push_back(table); + } else { + max_mf_dim_ = resource_->max_mf_dim(); + size_t val_type_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + size_t grad_type_size = TYPEALIGN( + 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto ptr_table = new PtrTable(capacity / load_factor_); + ptr_table->set_feature_value_size(val_type_size, grad_type_size); + ptr_tables_.push_back(ptr_table); + } if (multi_node_) { storage_[i].init(feanum_, resource_->dev_id(i)); } @@ -115,21 +128,21 @@ void HeterComm::memory_copy( template void HeterComm::create_storage(int start_index, int end_index, - int keylen, - int vallen) { + size_t keylen, + size_t vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceAllocate( + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].dev_num, start_index)); - allocator->DeviceAllocate( + keylen, resource_->remote_stream(nodes[i].dev_num, start_index))); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].dev_num, start_index)); + vallen, resource_->remote_stream(nodes[i].dev_num, start_index))); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } @@ -157,10 +170,10 @@ void HeterComm::destroy_storage(int start_index, for (size_t i = 0; i < nodes.size(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), - nodes[i].key_storage); - allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), - nodes[i].val_storage); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree( + resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree( + resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage)); } #endif } @@ -238,95 +251,132 @@ void HeterComm::walk_to_dest(int start_index, } template -void HeterComm::walk_to_src(int start_index, - int num, int* h_left, - int* h_right, - ValType* src_val) { +void HeterComm::walk_to_dest( + int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, + char* src_val, size_t val_size) { + int need_copy_val = 0; + if (src_val) { + need_copy_val = 1; + } std::queue que; + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + int size = path_[start_index][i].nodes_.size(); + auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); + que.push(t); 
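+ // Stage the shard bound for GPU i through its transfer path: hop 0 receives the keys (and, when src_val is non-null, the raw byte values) from the source buffers on its in_stream, and the while-loop below forwards each hop's key/val storage to the next hop on the receiving hop's in_stream, synchronizing whenever a hop is marked sync.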
+ cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + if (need_copy_val) { + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + if (cur_task.path->nodes_[cur_task.step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); + } + if (cur_task.step != cur_task.path->nodes_.size() - 1) { + int cur_step = cur_task.step; + CopyTask c(cur_task.path, cur_step + 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + if (need_copy_val) { + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + } + } + } +} - for (int i = 0; i < num; i++) { +template +void HeterComm::walk_to_src( + int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, + size_t val_size) { + std::queue que; + for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; - - auto src_dev_id = resource_->dev_id(i); - auto src_place = DevPlace(src_dev_id); - if (cur_step == 0) { - auto dst_dev_id = resource_->dev_id(start_index); - auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, reinterpret_cast(src_val + h_left[i]), - src_place, node.val_storage, node.val_bytes_len, - node.out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - - auto dst_dev_id = - resource_->dev_id(path_[start_index][i].nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - path_[start_index][i].nodes_[cur_step - 1].val_storage, - src_place, node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - path_[start_index][i].nodes_[cur_step - 1].out_stream); + cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream); } } - while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); int cur_step = cur_task.step; if (cur_task.path->nodes_[cur_step].sync) { - sync_stream(cur_task.path->nodes_[cur_step].out_stream); + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); } - - auto src_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); - auto src_place = DevPlace(src_dev_id); - if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - - auto dst_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, cur_task.path->nodes_[cur_step - 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cur_task.path->nodes_[cur_step - 
1].out_stream); - + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - - auto dst_dev_id = resource_->dev_id(end_index); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - reinterpret_cast(src_val + h_left[end_index]), - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cur_task.path->nodes_[cur_step].out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); } } } template HeterComm::~HeterComm() { - for (auto& table : tables_) { - delete table; - table = nullptr; + if (!multi_mf_dim_) { + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } else { + for (auto& table : ptr_tables_) { + delete table; + table = nullptr; + } + for (auto& table : tables_) { + delete table; + table = nullptr; + } } } template -void HeterComm::show_one_table(int num) { - tables_[num]->show(); +void HeterComm::show_one_table(int gpu_num) { + if (!multi_mf_dim_) { + tables_[gpu_num]->show(); + } } template @@ -362,7 +412,7 @@ void HeterComm::set_embedx_sgd( template void HeterComm::build_ps( int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num) { + size_t chunk_size, int stream_num, int offset) { if (len <= 0) { return; } @@ -403,8 +453,8 @@ void HeterComm::build_ps( memory_copy( dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); - - tables_[dev_num]->insert( + if (offset == -1) offset = dev_num; + tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, cur_use_stream); @@ -418,59 +468,179 @@ void HeterComm::build_ps( } } +template +void HeterComm::build_ps(int num, KeyType* h_keys, + char* pool, size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + + // use hbm pool + std::vector d_key_bufs; + + ppStream streams[stream_num]; // NOLINT + for (int i = 0; i < stream_num; ++i) { + create_stream(&(streams[i])); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + d_key_bufs.push_back(std::move(d_k_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + auto cur_use_stream = streams[cur_stream]; +#if defined(PADDLE_WITH_XPU_KP) + cur_use_stream = 0; +#endif + int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; + + auto dst_place = place; + auto src_place = platform::CPUPlace(); + + memory_copy( + dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + ptr_tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, + pool, feature_value_size, cur_len, cur_use_stream); + cur_stream += 1; + cur_len += tmp_len; + } + for (int i = 0; i < stream_num; ++i) { + sync_stream(streams[i]); + destroy_stream(streams[i]); + } +} + template void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT - int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - size_t temp_storage_bytes; - auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); temp_storage_bytes = 0; - auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - auto dst_place = platform::CPUPlace(); auto src_place = place; memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); - sync_stream(stream); } +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len) { + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; + uint32_t* d_idx = (uint32_t*)&d_index[len]; + int* d_merged_size = (int*)&d_idx[len]; + int grid_size = (len - 1) / block_size_ + 1; + heter_comm_kernel_->fill_idx(d_idx, len, stream); + 
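// A CPU analogue of the de-duplication pipeline that dynamic_merge_grad runs
// below with CUB: (1) sort keys while carrying their original index, (2)
// run-length encode the sorted keys into unique keys plus counts, (3)
// exclusive-sum the counts into run offsets, (4) merge every gradient of one
// key into a single slot (first one initializes, the rest accumulate, mirroring
// update_one / merge_one). DemoGrad and its fields are an illustrative stand-in
// for FeaturePushValue, not the real struct.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

struct DemoGrad {  // illustrative stand-in for FeaturePushValue
  int slot;
  float show, clk, lr_g;
};

int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 3, 3};
  std::vector<DemoGrad> grads = {{1, 1, 0, 0.5f}, {2, 1, 1, 0.2f},
                                 {1, 1, 0, 0.1f}, {2, 1, 0, 0.3f},
                                 {2, 1, 0, 0.4f}};

  // (1) sort (key, original index) pairs -- the role of SortPairs on d_idx/d_index.
  std::vector<uint32_t> index(keys.size());
  std::iota(index.begin(), index.end(), 0);
  std::sort(index.begin(), index.end(),
            [&](uint32_t a, uint32_t b) { return keys[a] < keys[b]; });

  // (2) run-length encode the sorted keys -- unique keys plus per-key counts.
  std::vector<uint64_t> uniq_keys;
  std::vector<uint32_t> fea_num;
  for (uint32_t i : index) {
    if (uniq_keys.empty() || uniq_keys.back() != keys[i]) {
      uniq_keys.push_back(keys[i]);
      fea_num.push_back(0);
    }
    ++fea_num.back();
  }

  // (3) exclusive prefix sum of counts -> start offset of each key's run.
  std::vector<uint32_t> offset(fea_num.size(), 0);
  for (size_t i = 1; i < fea_num.size(); ++i)
    offset[i] = offset[i - 1] + fea_num[i - 1];

  // (4) merge: the first gradient of a run initializes the output slot, the
  // remaining ones accumulate show/clk/lr_g.
  std::vector<DemoGrad> merged(uniq_keys.size());
  for (size_t u = 0; u < uniq_keys.size(); ++u) {
    merged[u] = grads[index[offset[u]]];
    for (uint32_t j = 1; j < fea_num[u]; ++j) {
      const DemoGrad& g = grads[index[offset[u] + j]];
      merged[u].show += g.show;
      merged[u].clk += g.clk;
      merged[u].lr_g += g.lr_g;
    }
  }
  for (size_t u = 0; u < uniq_keys.size(); ++u)
    std::cout << "key " << uniq_keys[u] << ": show=" << merged[u].show
              << " lr_g=" << merged[u].lr_g << std::endl;
  return 0;
}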
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, + 0, 8 * sizeof(KeyType), stream)); + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, + d_merged_size, len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); + + cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + assert(d_merged_size > 0); + uint32_t* d_offset = (uint32_t*)&d_index[len]; + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, + stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + heter_comm_kernel_->merge_gradient( + d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + template void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, @@ -529,8 +699,6 @@ void HeterComm::pull_sparse(int num, AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(num, 0); - // int grid_size = (len - 1) / block_size_ + 1; - int h_left[total_device]; // NOLINT int h_right[total_device]; // NOLINT @@ -562,10 +730,11 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - + size_t val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(ValType)); + auto d_shard_vals = memory::Alloc(place, len * val_type_size); ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -589,9 +758,8 @@ void HeterComm::pull_sparse(int num, continue; } create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(ValType)); + shard_len * val_type_size); } - walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); for 
(int i = 0; i < total_device; ++i) { @@ -600,14 +768,11 @@ void HeterComm::pull_sparse(int num, } auto& node = path_[num][i].nodes_.back(); sync_stream(node.in_stream); - AnyDeviceGuard guard(resource_->dev_id(i)); - - tables_[i]->rwlock_->RDLock(); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + ptr_tables_[i]->rwlock_->RDLock(); + ptr_tables_[i]->get(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); } for (int i = 0; i < total_device; ++i) { @@ -615,21 +780,18 @@ void HeterComm::pull_sparse(int num, if (h_left[i] == -1) { continue; } - tables_[i]->rwlock_->UNLock(); + ptr_tables_[i]->rwlock_->UNLock(); } - - walk_to_src(num, total_device, h_left, h_right, d_shard_vals_ptr); - + walk_to_src(num, total_device, h_left, h_right, + reinterpret_cast(d_shard_vals_ptr), val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - - heter_comm_kernel_->fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, + val_type_size, stream); sync_stream(stream); - for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -653,6 +815,8 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -691,21 +855,33 @@ void HeterComm::push_sparse(int dev_num, auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + + GradType* d_shard_grads_ptr; + if (!multi_mf_dim_) { + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } else { + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } int uniq_len = len; - merge_grad(dev_num, d_keys, d_grads, len, uniq_len); + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - // int grid_size = (uniq_len - 1) / block_size_ + 1; + int grid_size = (uniq_len - 1) / block_size_ + 1; split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, stream); + if (!multi_mf_dim_) { + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); + } else { + heter_comm_kernel_->dy_mf_fill_shard_grads( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size, stream); + } sync_stream(stream); @@ -721,12 +897,22 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + if (!multi_mf_dim_) { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * 
sizeof(GradType)); + } else { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * grad_value_size); + } } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - d_shard_grads_ptr); + if (!multi_mf_dim_) { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + } else { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), grad_value_size); + } for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -736,17 +922,28 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, dev_num)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } else { + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } } for (int i = 0; i < total_device; ++i) { sync_stream(resource_->remote_stream(i, dev_num)); if (h_left[i] != -1) { - tables_[i]->rwlock_->UNLock(); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } } } @@ -1078,11 +1275,13 @@ void HeterComm::end_pass() { tables_[index]->dump_to_cpu(dev_id, stream); }; - for (int i = 0; i < total_device; ++i) { - threads.push_back(std::thread(dump_to_cpu_func, i)); - } - for (auto& t : threads) { - t.join(); + if (!multi_mf_dim_) { + for (int i = 0; i < total_device; ++i) { + threads.push_back(std::thread(dump_to_cpu_func, i)); + } + for (auto& t : threads) { + t.join(); + } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index bdeb696a92bcef..94d7929b2947d2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,6 +117,53 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, size_t len, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + *(GradType*)((char*)d_shard_grads + i * grad_value_size) = + *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + } +} + +__global__ void merge_gradients_kernel(const uint32_t* offset, + const uint32_t* fea_num, + const uint32_t* index, const char* input, + char* output, int n, + size_t grad_value_size, + DynamicGradMerger& merger_) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < n) { + uint32_t start = offset[i]; + uint32_t num = fea_num[i]; + int ori_index = index[start]; + FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& in = + *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.update_one(out, in); + for (int j = 1; j < num; ++j) { + ori_index = index[start + j]; + in = 
*(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.merge_one(out, in); + } + } +} + +template +__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, size_t len, size_t val_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint64_t new_offset = uint64_t(idx[i]) * val_size; + *(ValType*)((char*)d_vals + new_offset) = + *(ValType*)((char*)d_shard_vals + i * val_size); + } +} + // cuda implemention of heter_comm_kernel.h template void HeterCommKernel::fill_idx(T* idx, long long len, @@ -207,8 +254,42 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, size_t grad_value_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_shard_grads_kernel<<>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len, + grad_value_size); +} + +template +void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + merge_gradients_kernel<<>>( + offset, fea_num, index, input, output, n, grad_value_size, merger_); +} + +template +void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, + T* idx, long long len, size_t val_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_dvals_kernel<<>>( + d_shard_vals, d_vals, idx, c_len, val_size); +} + template void HeterCommKernel::fill_idx( int* idx, long long len, const cudaStream_t& stream); +template void HeterCommKernel::fill_idx( + uint32_t* idx, long long len, const cudaStream_t& stream); template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, @@ -270,6 +351,23 @@ template void HeterCommKernel::reduce_by_key< paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous); +template void HeterCommKernel::dy_mf_fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + size_t grad_value_size, const cudaStream_t& stream); + +template void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const cudaStream_t& stream); + +template void HeterCommKernel::dy_mf_fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + size_t val_size, const cudaStream_t& stream); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 9d2ee5d272c722..4f866ccda82017 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -27,6 +27,42 @@ limitations under the 
License. */ namespace paddle { namespace framework { +struct DynamicGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + + return out; + } + + template + __device__ __forceinline__ void update_one(T& output, const T& input) { + output.slot = input.slot; + output.show = input.show; + output.clk = input.clk; + output.mf_dim = input.mf_dim; + output.lr_g = input.lr_g; + for (int i = 0; i < output.mf_dim; ++i) { + output.mf_g[i] = input.mf_g[i]; + } + } + template + __device__ __forceinline__ void merge_one(T& output, const T& input) { + output.show += input.show; + output.clk += input.clk; + output.lr_g += input.lr_g; + for (int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; + } + } +}; + class HeterCommKernel { public: HeterCommKernel() {} @@ -80,6 +116,24 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); + template + void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, + T* idx, long long len, size_t grad_value_size, + const StreamType& stream); + + template + void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, + int n, size_t grad_value_size, DynamicGradMerger& merger_, + const StreamType& stream); + + template + void dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, size_t val_size, + const StreamType& stream); + private: int block_size_{256}; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 66e06b13b046f4..43b84ee5d26fbe 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -44,6 +44,13 @@ void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); } +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) { + comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, + stream_num); +} + int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } @@ -72,6 +79,10 @@ void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 70b88350f2720a..8449a4048b72f9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -37,11 +37,14 @@ class HeterPs : public HeterPsBase { size_t len) override; void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) override; - + void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) override; #if defined(PADDLE_WITH_CUDA) void set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int 
comm_size) override; + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 0727e2c2dbce1c..2c312e9d4d60aa 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -35,11 +35,15 @@ class HeterPsBase { size_t len) = 0; virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) = 0; + virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 17bc12a5af1a73..5717f44d400a55 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -107,6 +107,8 @@ class HeterPsResource { int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); + int multi_mf() { return multi_mf_dim_; } + int max_mf_dim() { return max_mf_dim_; } ppStream local_stream(int dev_num, int stream_num); ppStream remote_stream(int dev_num, int stream_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 065d5e6d527fc0..4684b4a0bc155c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -125,20 +125,21 @@ class Optimizer { if (optimizer_config.mf_create_thresholds <= optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + optimizer_config.clk_coeff * ptr->clk) { - // ptr->mf_size = ptr->mf_dim + 1; + ptr->mf_size = ptr->mf_dim + 1; - ptr->mf_size = MF_DIM + 1; + // ptr->mf_size = MF_DIM + 1; ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { + for (int i = 0; i < ptr->mf_dim; ++i) { ptr->mf[i + 1] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], + grad.mf_g, grad.show); // for local test } } diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index ff3cd9d2d046d1..afeaf0b5541e44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -25,9 +25,6 @@ using namespace paddle::framework; namespace platform = paddle::platform; -// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph -// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( -// std::vector ids) std::string edges[] = { std::string("0\t1"), std::string("0\t9"), std::string("1\t2"), @@ -109,13 +106,13 @@ TEST(TEST_FLEET, test_cpu_cache) { 
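// A minimal sketch of the per-feature embedding creation that the optimizer
// change above switches from the compile-time MF_DIM to the runtime ptr->mf_dim:
// mf_size becomes mf_dim + 1, with mf[0] used as the accumulator that update_mf
// consumes and the next mf_dim entries holding the embedding. DemoValue and
// init_mf are illustrative names, not the real FeatureValue / optimizer code.
#include <cstdlib>
#include <iostream>
#include <vector>

struct DemoValue {  // illustrative stand-in for FeatureValue
  int mf_dim = 0;
  int mf_size = 0;
  std::vector<float> mf;
};

void init_mf(DemoValue& v, float mf_initial_range) {
  v.mf_size = v.mf_dim + 1;    // accumulator slot 0 + mf_dim embedding entries
  v.mf.assign(v.mf_size, 0.0f);
  v.mf[0] = 0.0f;
  for (int i = 0; i < v.mf_dim; ++i) {
    // uniform in [0, mf_initial_range), roughly mirroring curand_uniform * range
    v.mf[i + 1] = mf_initial_range * (std::rand() / (RAND_MAX + 1.0f));
  }
}

int main() {
  DemoValue a, b;
  a.mf_dim = 8;   // different features may now carry different dims
  b.mf_dim = 64;
  init_mf(a, 0.02f);
  init_mf(b, 0.02f);
  std::cout << "a.mf_size=" << a.mf_size << " b.mf_size=" << b.mf_size
            << std::endl;
  return 0;
}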
std::make_shared(device_id_mapping); resource->enable_p2p(); int use_nv = 1; - GpuPsGraphTable g(resource, use_nv); + GpuPsGraphTable g(resource, use_nv, 1, 2); g.init_cpu_table(table_proto); - g.cpu_graph_table->Load(node_file_name, "nuser"); - g.cpu_graph_table->Load(node_file_name, "nitem"); + g.cpu_graph_table_->Load(node_file_name, "nuser"); + g.cpu_graph_table_->Load(node_file_name, "nitem"); std::remove(node_file_name); std::vector vec; - std::vector node_ids; + std::vector node_ids; node_ids.push_back(37); node_ids.push_back(96); std::vector> node_feat(2, @@ -123,38 +120,29 @@ TEST(TEST_FLEET, test_cpu_cache) { std::vector feature_names; feature_names.push_back(std::string("c")); feature_names.push_back(std::string("d")); - g.cpu_graph_table->get_node_feat(0, node_ids, feature_names, node_feat); + g.cpu_graph_table_->get_node_feat(0, node_ids, feature_names, node_feat); VLOG(0) << "get_node_feat: " << node_feat[0][0]; VLOG(0) << "get_node_feat: " << node_feat[0][1]; VLOG(0) << "get_node_feat: " << node_feat[1][0]; VLOG(0) << "get_node_feat: " << node_feat[1][1]; int n = 10; - std::vector ids0, ids1; + std::vector ids0, ids1; for (int i = 0; i < n; i++) { - g.cpu_graph_table->add_comm_edge(0, i, (i + 1) % n); - g.cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n); + g.cpu_graph_table_->add_comm_edge(0, i, (i + 1) % n); + g.cpu_graph_table_->add_comm_edge(0, i, (i - 1 + n) % n); if (i % 2 == 0) ids0.push_back(i); } - g.cpu_graph_table->build_sampler(0); + g.cpu_graph_table_->build_sampler(0); ids1.push_back(5); ids1.push_back(7); - vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0)); - vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1)); + vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids0)); + vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids1)); vec[0].display_on_cpu(); vec[1].display_on_cpu(); // g.build_graph_from_cpu(vec); - g.build_graph_on_single_gpu(vec[0], 0); - g.build_graph_on_single_gpu(vec[1], 1); - int64_t cpu_key[3] = {0, 1, 2}; - /* - std::vector> buffers(3); - std::vector actual_sizes(3,0); - g.cpu_graph_table->random_sample_neighbors(cpu_key,2,buffers,actual_sizes,false); - for(int i = 0;i < 3;i++){ - VLOG(0)<<"sample from cpu key->"<set_search_level(2); - // g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u"); - g.cpu_graph_table->Load(edge_file_name, "e>u2u"); - g.cpu_graph_table->make_partitions(0, 64, 2); + g.cpu_graph_table_->clear_graph(0); + g.cpu_graph_table_->set_search_level(2); + g.cpu_graph_table_->Load(edge_file_name, "e>u2u"); + g.cpu_graph_table_->make_partitions(0, 64, 2); int index = 0; - while (g.cpu_graph_table->load_next_partition(0) != -1) { - auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len); + while (g.cpu_graph_table_->load_next_partition(0) != -1) { + auto all_ids = g.cpu_graph_table_->get_all_id(0, 0, device_len); for (auto x : all_ids) { for (auto y : x) { VLOG(0) << "part " << index << " " << y; @@ -195,19 +183,19 @@ TEST(TEST_FLEET, test_cpu_cache) { } for (int i = 0; i < all_ids.size(); i++) { GpuPsCommGraph sub_graph = - g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]); - g.build_graph_on_single_gpu(sub_graph, i); + g.cpu_graph_table_->make_gpu_ps_graph(0, all_ids[i]); + g.build_graph_on_single_gpu(sub_graph, i, 0); VLOG(2) << "sub graph on gpu " << i << " is built"; } VLOG(0) << "start to iterate gpu graph node"; - g.cpu_graph_table->make_complementary_graph(0, 64); + g.cpu_graph_table_->make_complementary_graph(0, 64); for (int i = 0; i < 2; i++) { // platform::CUDADeviceGuard 
guard(i); LOG(0) << "query on card " << i; int step = 2; int cur = 0; while (true) { - auto node_query_res = g.query_node_list(i, cur, step); + auto node_query_res = g.query_node_list(i, 0, cur, step); node_query_res.display(); if (node_query_res.get_len() == 0) { VLOG(0) << "no more ids,break"; @@ -215,23 +203,23 @@ TEST(TEST_FLEET, test_cpu_cache) { } cur += node_query_res.get_len(); NeighborSampleQuery query, q1; - query.initialize(i, node_query_res.get_val(), 4, + query.initialize(i, 0, node_query_res.get_val(), 4, node_query_res.get_len()); query.display(); auto c = g.graph_neighbor_sample_v3(query, true); c.display(); platform::CUDADeviceGuard guard(i); - int64_t *key; + uint64_t *key; VLOG(0) << "sample key 1 globally"; - g.cpu_graph_table->set_search_level(2); - cudaMalloc((void **)&key, sizeof(int64_t)); - int64_t t_key = 1; - cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice); - q1.initialize(i, (int64_t)key, 2, 1); + g.cpu_graph_table_->set_search_level(2); + cudaMalloc((void **)&key, sizeof(uint64_t)); + uint64_t t_key = 1; + cudaMemcpy(key, &t_key, sizeof(uint64_t), cudaMemcpyHostToDevice); + q1.initialize(i, 0, (uint64_t)key, 2, 1); auto d = g.graph_neighbor_sample_v3(q1, true); d.display(); cudaFree(key); - g.cpu_graph_table->set_search_level(1); + g.cpu_graph_table_->set_search_level(1); } } index++; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index f512fcc7b9fdbe..cf9fb14bb9b9cd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,12 +28,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" + #include #include -#include "paddle/fluid/framework/data_set.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" +#if defined(PADDLE_WITH_PSCORE) +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { @@ -107,29 +111,16 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - if (!multi_mf_dim_) { - gpu_task->init(thread_keys_shard_num_, device_num); - } else { - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - } - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); std::vector threads; - // data should be in input channel - if (!multi_mf_dim_) { - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); - } - } else { - thread_dim_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_dim_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - thread_dim_keys_[i][j].resize(multi_mf_dim_); - } + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); } } @@ -140,68 +131,128 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); - if 
(data_set_name.find("SlotRecordDataset") != std::string::npos) { - VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + VLOG(0) << "gpu_graph_mode_:" << gpu_graph_mode_; + if (!gpu_graph_mode_) { + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "psgpu wrapperinputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_dynamic_mf_func = [this]( + const std::deque& total_data, int begin_index, + int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; + for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); + slot_idx++) { + for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + int shard_id = feasign_v[j] % thread_keys_shard_num_; + int dim_id = slot_index_vec_[slot_idx]; + if (feasign_v[j] != 0) { + this->thread_dim_keys_[i][shard_id][dim_id].insert( + feasign_v[j]); + } + } + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() + << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() + << " seconds."; + } + } else { + VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = dynamic_cast(dataset_); - auto input_channel = dataset->GetInputChannel(); - VLOG(0) << "yxf::buildtask::inputslotchannle size: " - << input_channel->Size(); - const std::deque& vec_data = input_channel->GetData(); + const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); total_len = vec_data.size(); len_per_thread = total_len / thread_keys_thread_num_; + VLOG(0) << "GpuGraphTotalKeys: " << total_len; remain = total_len % thread_keys_thread_num_; - VLOG(0) << "total len: " << total_len; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(feasign); - } - } - }; - auto gen_dynamic_mf_func = [this](const std::deque& total_data, + auto gen_graph_data_func = [this](const std::vector& total_data, int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; - for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); - slot_idx++) { - for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; - j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { - int shard_id = feasign_v[j] % thread_keys_shard_num_; - int dim_id = slot_index_vec_[slot_idx]; - this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); - } - } + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); } - /* + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, int begin_index, int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_dim_keys_[i][shard_id][0].insert(feasign); - } + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // int dim_id = slot_index_vec_[slot_idx]; + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); } - */ }; for (int i = 0; i < thread_keys_thread_num_; i++) { if (!multi_mf_dim_) { - VLOG(0) << "yxf::psgpu wrapper genfunc"; + VLOG(0) << "psgpu graph wrapper genfunc"; threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, + std::thread(gen_graph_data_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); } else { - VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; + VLOG(0) << "psgpu graph wrapper genfunc with dynamic mf"; threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); } begin += len_per_thread + (i < remain ? 
1 : 0); @@ -209,54 +260,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { for (std::thread& t : threads) { t.join(); } - timeline.Pause(); - VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; - } else { - CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); - VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; - MultiSlotDataset* dataset = dynamic_cast(dataset_); - auto input_channel = dataset->GetInputChannel(); - - const std::deque& vec_data = input_channel->GetData(); - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - remain = total_len % thread_keys_thread_num_; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - begin += len_per_thread + (i < remain ? 1 : 0); - } - for (std::thread& t : threads) { - t.join(); - } - timeline.Pause(); - VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); threads.clear(); // merge thread_keys to shard_keys - auto merge_ins_func = [this, gpu_task](int shard_num) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); - thread_keys_[i][shard_num].clear(); - } - }; auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { gpu_task->batch_add_keys(shard_num, dim_id, @@ -264,19 +273,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { thread_dim_keys_[i][shard_num][dim_id].clear(); } }; - // for (size_t i = 0; i < thread_keys_.size(); i++) { - // gpu_task->batch_add_keys(thread_keys_[i]); - // for (int j = 0; j < thread_keys_thread_num_; j++) { - // thread_keys_[i][j].clear(); - // } - //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - if (!multi_mf_dim_) { - threads.push_back(std::thread(merge_ins_func, i)); - } else { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); } } for (auto& t : threads) { @@ -291,20 +290,15 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { timeline.Pause(); VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - - if (!multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); - local_ptr[i].resize(local_keys[i].size()); - } - } else { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if 
(i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); } + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); } } } @@ -324,12 +318,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; auto& device_dim_mutex = gpu_task->dim_mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -353,18 +347,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif timeline.Start(); - auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { - size_t key_size = local_keys[i].size(); + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { + size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; -#ifdef PADDLE_WITH_PSLIB - // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - // reinterpret_cast(local_ptr[i].data()), this->table_id_, - // local_keys[i].data(), key_size); int32_t cnt = 0; +#ifdef PADDLE_WITH_PSLIB while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -392,11 +385,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - int32_t cnt = 0; while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -423,51 +415,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(300); - exit(-1); - } else { - VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " - << local_keys[i].size(); - } - }; - - auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, - &fleet_ptr](int i, int j) { -#ifdef PADDLE_WITH_PSLIB - size_t key_size = local_dim_keys[i][j].size(); - int32_t status = -1; - int32_t cnt = 0; - while (true) { - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_dim_ptr[i][j].data()), - this->table_id_, local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } - - if (flag) { - break; - } - } if 
(status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -476,23 +423,19 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " << local_dim_keys[i][j].size(); } -#endif }; - if (!multi_mf_dim_) { - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(ptl_func, i); - } - } else { - threads.resize(thread_keys_shard_num_ * multi_mf_dim_); - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, j); - } + + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + task_futures.emplace_back( + pull_thread_pool_[i]->enqueue(ptl_dynamic_mf_func, i, j)); } } - for (std::thread& t : threads) { - t.join(); + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; @@ -509,45 +452,40 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { std::vector>> pass_values; bool record_status = false; -#ifdef PADDLE_WITH_PSLIB - uint16_t pass_id = 0; - if (multi_node_) { - record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( - table_id_, pass_id, pass_values); - } -#endif auto& device_task_keys = gpu_task->device_task_keys_; auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, - &local_dim_ptr, &device_dim_keys, - &device_dim_ptr, - &device_dim_mutex](int i, int j) { -#ifdef PADDLE_WITH_PSLIB + auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { int shard = local_dim_keys[i][j][k] % device_num; task_keys[shard].push_back(local_dim_keys[i][j][k]); task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } + // allocate local keys to devices for (int dev = 0; dev < device_num; dev++) { - for (int dim = 0; dim < multi_mf_dim_; dim++) { - device_dim_mutex[dev][dim]->lock(); - - int len = task_keys[dev].size(); - int cur = device_dim_keys[dev][dim].size(); - device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + - len); - device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); - for (int k = 0; k < len; ++k) { - device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; - device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; - } - device_dim_mutex[dev][dim]->unlock(); + device_dim_mutex[dev][j]->lock(); + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; } + device_dim_mutex[dev][j]->unlock(); } -#endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, &local_ptr, &device_task_keys, &device_task_ptrs](int i) { @@ -697,7 +635,7 @@ void 
PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { threads[i * multi_mf_dim_ + j] = - std::thread(build_dynamic_mf_func, i, j); + std::thread(build_pull_dynamic_mf_func, i, j); } } for (std::thread& t : threads) { @@ -727,22 +665,19 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { std::vector feature_keys_count(device_num); size_t size_max = 0; - if (!multi_mf_dim_) { - for (int i = 0; i < device_num; i++) { - feature_keys_count[i] = gpu_task->device_keys_[i].size(); - VLOG(0) << i << " card contains feasign nums: " << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); - } - } else { - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); - } - VLOG(0) << i << " card with dynamic mf contains feasign nums: " - << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); + + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + VLOG(1) << i << " card with dynamic mf dim: " << index_dim_vec_[j] + << " dim index: " << j << " contains feasign nums: " + << gpu_task->device_dim_ptr_[i][j].size(); } + VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); } + if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; @@ -756,18 +691,95 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); #endif - auto build_func = [this, &gpu_task, &feature_keys_count](int i) { - VLOG(3) << "building table: " << i; - this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), - gpu_task->device_values_[i].data(), - feature_keys_count[i], 500000, 2); - // if (feature_keys_count[i] > 0) { - // HeterPs_->show_one_table(i); - // } + auto build_dynamic_mf_func = [this, &gpu_task](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + for (size_t k = 0; k < len; k++) { + FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); +#ifdef PADDLE_WITH_PSLIB + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + 
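+ // Layout note: ptr_val is the CpuPS-side float array filled by the Dymf
+ // accessor; its first 8 floats are the fixed header addressed through the
+ // accessor indices used here, and anything past index 8 is the expanded
+ // g2sum plus mf_dim embedding weights, which is what the `dim > 8` branch
+ // below copies into val->mf.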
val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor accessor; + val->delta_score = + ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; + val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; + val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; + val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]); + val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; + val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; + + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } + } + + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + + this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, + feature_value_size, 500000, 2); + + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << mf_dim << " len: " << len; + this->HeterPs_->show_one_table(i); + } + delete mem_pool; }; - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + } } + for (std::thread& t : threads) { t.join(); } @@ -788,10 +800,12 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { if (is_shuffle) { dataset_->LocalShuffle(); } - + InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -873,13 +887,105 @@ void PSGPUWrapper::EndPass() { timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { - keysize_max = std::max(keysize_max, current_task_->device_keys_[i].size()); + for (int j = 0; j < multi_mf_dim_; j++) { + keysize_max = + std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); + } + } + + auto dump_pool_to_cpu_func = [this](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + + char* test_build_values = (char*)malloc(feature_value_size * len); + 
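+ // Each record in the HBM pool spans feature_value_size =
+ // TYPEALIGN(8, sizeof(FeatureValue) + (mf_dim + 1) * sizeof(float)) bytes, so
+ // the device-to-host copy below brings back `len` fixed-stride records and the
+ // value for key `index` is decoded at offset index * feature_value_size.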
cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; ++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); +#ifdef PADDLE_WITH_PSLIB + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + delta_score_index()] = gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + show_index()] = gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + click_index()] = gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_w_index()] = gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_g2sum_index()] = gpu_val->lr_g2sum; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + slot_index()] = gpu_val->slot; +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + + paddle::distributed::CtrDymfAccessor accessor; + cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = + gpu_val->delta_score; + cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; + cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; + cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; + cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = + gpu_val->lr_g2sum; + cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; +#endif + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + cpu_val[x + 8] = gpu_val->mf[x]; + } + } + } + free(test_build_values); + }; + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } } if (keysize_max != 0) { HeterPs_->end_pass(); } - + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -936,8 +1042,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS.")); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length 
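The hunk that follows adds a dim-aware PullSparse overload: the scratch buffer is sized from what appears to be the largest configured embedding width (index_dim_vec_.back()), each pooled record spans TYPEALIGN(8, sizeof(FeatureValue) + (mf_dim + 1) * sizeof(float)) bytes, and PullCopy later recovers a slot's mf_dim as its slot dim minus the three leading [show, clk, lr] floats. A minimal sketch of that size and stride arithmetic, assuming TYPEALIGN(8, len) rounds len up to a multiple of 8 and using a hypothetical FeatureValueSketch in place of Paddle's FeatureValue:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in with the fields the wrapper touches; the real
// FeatureValue is defined in heter_ps and may differ.
struct FeatureValueSketch {
  float delta_score, show, clk, lr, lr_g2sum;
  int slot, mf_dim, mf_size;
  std::uint64_t cpu_ptr;
  float mf[1];  // trailing storage: 1 accumulator + mf_dim weights
};

// Assumed behaviour of TYPEALIGN(8, len): round len up to a multiple of 8.
constexpr std::size_t AlignUp8(std::size_t len) {
  return (len + 7) & ~static_cast<std::size_t>(7);
}

// Bytes per pooled record for a given embedding width.
constexpr std::size_t PooledValueSize(int mf_dim) {
  return AlignUp8(sizeof(FeatureValueSketch) + (mf_dim + 1) * sizeof(float));
}

int main() {
  int slot_dim = 11;          // a slot configured with dim 11, for illustration
  int mf_dim = slot_dim - 3;  // PullCopy: gpu_dim[x] - 3 ([show, clk, lr] lead)
  std::printf("record bytes: %zu, dense output stride: %d floats\n",
              PooledValueSize(mf_dim), mf_dim + 3);
  return 0;
}

The 8-byte rounding presumably keeps byte-indexed records aligned for the uint64_t member when the pool is addressed as base + index * feature_value_size.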
@@ -945,6 +1049,97 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, static_cast(slot_lengths.size()), hidden_size, total_length); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; +} + +void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, + const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, + const int hidden_size) { + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + size_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + size_t feature_value_size = 0; + + feature_value_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begine Gpu Ps PullSparse"; + auto buf = memory::Alloc(place, total_length * feature_value_size); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PullSparse"; + FeatureValue* total_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_values_gpu), + total_length * feature_value_size); +#endif + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(total_keys_tensor.mutable_data( + {int64_t(total_length), 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); + int* gpu_dim = reinterpret_cast(buf_dim->ptr()); + cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + total_length); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + 
total_length, gpu_dim); + + pull_gpups_timer.Pause(); + #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP @@ -1013,7 +1208,10 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto buf = memory::Alloc(place, total_length * grad_value_size); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { @@ -1027,8 +1225,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + if (!multi_mf_dim_) { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + } else { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size); + } VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1060,6 +1263,8 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); } all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() << " s"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 3df5a4b473861e..488a9ef8ce78ff 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -61,6 +61,45 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? 
len[x - 1] : 0); + FeatureValue* feature_value_ptr = + (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + if (*(keys[x] + y) == 0) { + *(dest[x] + y * (mf_dim + 3)) = 0; + *(dest[x] + y * (mf_dim + 3) + 1) = 0; + *(dest[x] + y * (mf_dim + 3) + 2) = 0; + } else { + *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; + *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; + *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; + } + if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; + } + } + } +} + __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -105,6 +144,35 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +__global__ void PushCopyWithPool(FeaturePushValue* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue* cur = + (FeaturePushValue*)((char*)dest + i * grad_value_size); + cur->slot = slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur->mf_dim = mf_dim; + cur->show = *(src[x] + y * (mf_dim + 3)); + cur->clk = *(src[x] + y * (mf_dim + 3) + 1); + cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < cur->mf_dim; j++) { + cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, @@ -128,6 +196,26 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, int* gpu_dim) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + val_type_size_, gpu_dim); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -177,6 +265,45 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, size_t grad_value_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c38b819822c28b..3addf23ce82071 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ 
b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif #include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" @@ -54,6 +55,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" #endif +#ifdef PADDLE_WITH_PSLIB +#include "downpour_accessor.h" // NOLINT +#endif namespace paddle { namespace framework { @@ -95,12 +99,21 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; + pull_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < pull_thread_pool_.size(); i++) { + pull_thread_pool_[i].reset(new ::ThreadPool(1)); + } hbm_thread_pool_.resize(thread_keys_shard_num_); for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } } + void PullSparse(const paddle::platform::Place& place, const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, const int hidden_size); void PullSparse(const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, @@ -119,13 +132,23 @@ class PSGPUWrapper { const FeatureValue* total_values_gpu, const int64_t* gpu_len, const int slot_num, const int hidden_size, const int64_t total_length); - + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim); void CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, FeaturePushValue* total_grad_values_gpu, const std::vector& slot_lengths, const int hidden_size, const int64_t total_length, const int batch_size); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -310,13 +333,40 @@ class PSGPUWrapper { void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; + std::cout << "yxf set: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; } #ifdef PADDLE_WITH_CUDA void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { slot_mf_dim_vector_ = slot_mf_dim_vector; assert(slot_mf_dim_vector_.size() == slot_vector_.size()); - for (size_t i = 0; i < slot_mf_dim_vector.size(); i++) { + } + + void InitSlotInfo() { + if (slot_info_initialized_) { + return; + } + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto slots_vec = dataset->GetSlots(); + slot_offset_vector_.clear(); + for (auto& slot : slot_vector_) { + for (size_t i = 0; i < slots_vec.size(); ++i) { + if (std::to_string(slot) == slots_vec[i]) { + slot_offset_vector_.push_back(i); + break; + } + } + } + std::cout << "psgpu wrapper use slots: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + for (size_t i = 0; i < slot_mf_dim_vector_.size(); i++) { slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; } @@ -345,6 +395,7 @@ class 
PSGPUWrapper { TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); grad_type_size_ = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + slot_info_initialized_ = true; } #endif @@ -385,9 +436,16 @@ class PSGPUWrapper { int max_mf_dim_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; + int gpu_graph_mode_ = 1; #ifdef PADDLE_WITH_CUDA std::vector inner_comms_; std::vector inter_comms_; @@ -405,6 +463,7 @@ class PSGPUWrapper { int year_; int month_; int day_; + bool slot_info_initialized_ = false; int use_afs_api_ = 0; #ifdef PADDLE_WITH_CUDA @@ -428,6 +487,7 @@ class PSGPUWrapper { std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; bool running_ = false; + std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; protected: diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cb33e87f490c25..cee122e540f7e1 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -118,6 +118,11 @@ void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) { void HogwildWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif device_reader_->Start(); std::vector op_total_time; std::vector op_name; @@ -174,8 +179,6 @@ void HogwildWorker::TrainFilesWithProfiler() { PrintFetchVars(); #ifdef PADDLE_WITH_HETERPS dev_ctx_->Wait(); - VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time - << " seconds, ins_num: " << total_inst; for (size_t i = 0; i < op_name.size(); ++i) { VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] << ", mean time: " << op_total_time[i] / total_inst @@ -197,6 +200,9 @@ void HogwildWorker::TrainFilesWithProfiler() { thread_scope_->DropKids(); timeline.Start(); } + VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst << " read time: " << read_time + << "seconds "; if (need_dump_field_ || need_dump_param_) { writer_.Flush(); @@ -213,12 +219,21 @@ void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); platform::Timer timeline; timeline.Start(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; int batch_cnt = 0; + +#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA) + platform::SetDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { for (auto &op : ops_) { bool need_skip = false; @@ -244,9 +259,12 @@ void HogwildWorker::TrainFiles() { ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } timeline.Pause(); - VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; if (need_dump_field_ || need_dump_param_) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 
7a83fdccc218c4..6479f7ae726548 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, } } #endif + for (auto& var : main_program.Block(0).AllVars()) { + if (var->Persistable()) { + auto it = std::find(need_merge_var_names_.begin(), + need_merge_var_names_.end(), var->Name()); + if (it == need_merge_var_names_.end() && + var->GetType() != proto::VarType::SELECTED_ROWS) { + VLOG(2) << "train param: " << var->Name(); + trainable_param_.push_back(var->Name()); + } + } + } } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -192,18 +203,30 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { -#ifdef PADDLE_WTIH_PSCORE +#ifdef PADDLE_WITH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); - auto& recv_ctx = communicator->GetRecvCtxMap(); - Scope* thread_scope = workers_[0]->GetThreadScope(); - for (auto& iter : recv_ctx) { - auto& varnames = iter.second; - for (auto& name : varnames) { + auto thread_scope = workers_[0]->GetThreadScope(); + if (communicator == nullptr) { + for (auto& name : trainable_param_) { + VLOG(2) << "merge var " << name << " to root scope"; Variable* root_var = root_scope_->FindVar(name); LoDTensor* root_tensor = root_var->GetMutable(); Variable* var = thread_scope->FindVar(name); LoDTensor* tensor = var->GetMutable(); - TensorCopy((*tensor), root_tensor->place(), root_tensor); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } + } else { + auto& recv_ctx = communicator->GetRecvCtxMap(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + VLOG(2) << "merge var " << name << " to root scope"; + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } } } #endif @@ -236,11 +259,7 @@ void MultiTrainer::Finalize() { } LoDTensor* root_tensor = root_var->GetMutable(); -#ifdef PADDLE_WITH_HETERPS - for (size_t j = 0; j < places_.size(); j++) { -#else for (int j = 1; j < thread_num_; j++) { -#endif Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Variable* thread_var = cur_thread_scope->FindVar(need_merge_var_names_[i]); diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 6046000739976c..92643a254f8efb 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -27,8 +27,8 @@ cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enfo if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") add_custom_target( download_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program + COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program --no-check-certificate + COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program --no-check-certificate ) # all operators used in the program diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index ad1ddbfabd0911..b7674e06b9f73d 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ 
b/paddle/fluid/framework/ps_gpu_worker.cc @@ -128,16 +128,16 @@ void PSGPUWorker::TrainFiles() { timeline.Start(); int total_ins_num = 0; - - // how to accumulate fetched values here - device_reader_->Start(); - int cur_batch; - int batch_cnt = 0; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); #endif + + // how to accumulate fetched values here + device_reader_->Start(); + int cur_batch; + int batch_cnt = 0; while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index b86b4fec8a5718..c78f7611b63bee 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase { std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; + std::vector trainable_param_; #ifdef PADDLE_WITH_HETERPS std::vector places_; #endif diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h index e8525f440fe7f2..44eacc6a70554a 100644 --- a/paddle/fluid/inference/api/paddle_infer_declare.h +++ b/paddle/fluid/inference/api/paddle_infer_declare.h @@ -23,5 +23,7 @@ #endif // PADDLE_DLL_INFERENCE #endif // PD_INFER_DECL #else +#ifndef PD_INFER_DECL #define PD_INFER_DECL __attribute__((visibility("default"))) +#endif // PD_INFER_DECL #endif // _WIN32 diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc new file mode 100644 index 00000000000000..c8ab269c023a5b --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +constexpr int64_t kNoPadding = -1; + +template +class LookupTableV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); // int tensor + auto *output_t = ctx.Output("Out"); // float tensor + auto *table_t = ctx.Input("W"); + + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ( + table_var->IsType(), true, + platform::errors::InvalidArgument("mlu only accept LoDTensor")); + output_t->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc table_desc(*table_t); + MLUCnnlTensorDesc output_desc(*output_t); + + int64_t padding_idx = ctx.Attr("padding_idx"); + if (padding_idx == kNoPadding) { + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + table_desc.get(), GetBasePtr(table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } else { + Tensor tmp_table_t(table_t->type()); + tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); + + Tensor index; + index.mutable_data({1, 1}, ctx.GetPlace()); + auto idx_value = static_cast(padding_idx); + MLUCnnlTensorDesc index_desc(index); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &idx_value, index_desc.get(), + GetBasePtr(&index)); + + auto update_dim = phi::make_ddim({1, table_t->dims()[1]}); + Tensor update; + update.mutable_data(update_dim, ctx.GetPlace()); + + auto update_value = static_cast(0); + MLUCnnlTensorDesc update_desc(update); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &update_value, + update_desc.get(), GetBasePtr(&update)); + + MLUCnnlTensorDesc tmp_table_desc(tmp_table_t); + MLUCnnl::ScatterNd( + ctx, CNNL_SCATTERND_UPDATE, index_desc.get(), GetBasePtr(&index), + update_desc.get(), GetBasePtr(&update), table_desc.get(), + GetBasePtr(table_t), tmp_table_desc.get(), GetBasePtr(&tmp_table_t)); + + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + tmp_table_desc.get(), GetBasePtr(&tmp_table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } + } +}; + +template +class LookupTableV2GradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = + ctx.Input(framework::GradVarName("Out")); + auto *table_grad_t = + ctx.Output(framework::GradVarName("W")); + table_grad_t->mutable_data(ctx.GetPlace()); + + int padding_idx = static_cast(ctx.Attr("padding_idx")); + + Tensor ids_int32(ids_t->dtype()); + if (ids_t->dtype() != DataType::INT32) { + ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32); + MLUCnnl::Cast(ctx, cast_type, ids_desc.get(), GetBasePtr(ids_t), + ids_int32_desc.get(), GetBasePtr(&ids_int32)); + } else { + ids_int32 = *ids_t; + } + + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + MLUCnnlTensorDesc output_grad_desc(*output_grad_t); + MLUCnnlTensorDesc table_grad_desc(*table_grad_t); + + MLUCnnl::EmbeddingBackward(ctx, padding_idx, false, ids_int32_desc.get(), + GetBasePtr(&ids_int32), output_grad_desc.get(), + GetBasePtr(output_grad_t), table_grad_desc.get(), + GetBasePtr(table_grad_t)); + } +}; +} // namespace operators +} // namespace 
paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(lookup_table_v2, ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel); + +REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index ecee094de346e6..393247644c2e88 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -44,14 +44,6 @@ class MKLDNNActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor")); - Functor functor; functor(ctx); } @@ -62,14 +54,6 @@ class MKLDNNActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Input OutGrad tensor")); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Input OutGrad tensor")); - Functor functor; functor(ctx); } diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index b10572edf6f273..747e4603d7fe77 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -36,100 +36,58 @@ template class DeQuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE(scale_data, 0.0f, - platform::errors::InvalidArgument( - "Dequantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Dequantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Dequantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_shift = quantization_shift != 0.0f; + auto* out = ctx.Output("Output"); + + PADDLE_ENFORCE(quantization_scale != 0.0f, + platform::errors::InvalidArgument( + "Dequantization scale must be different than 0.0f")); + + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Dequantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - - const T* input_data = input->data(); - float* output_data = output->mutable_data(ctx.GetPlace()); - - float reorder_shift = 
-scale_shift / scale_data; - - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); - dnnl::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); - MKLDNNMemoryFormat src_fmt = input->format(); - - std::string key = - platform::CreateKey(dev_ctx, src_dt, src_tz, ctx.OutputName("Output")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_prim = key + "@r"; - const std::string key_src_mem = key + "@s"; - const std::string key_dst_mem = key + "@d"; - - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); - - if (reorder_p == nullptr) { - dnnl::primitive_attr attri; - int mask = 0; - float reorder_scale = 1. / scale_data; - attri.set_output_scales(mask, {reorder_scale}); - - if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - std::fill(output_data, output_data + output->numel(), reorder_shift); - } - - auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); - - auto dst_md = - platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, - platform::MKLDNNFormatForSize( - dst_tz.size(), MKLDNNMemoryFormat::nchw)); - - dst_memory = std::make_shared( - dst_md, engine, to_void_cast(output_data)); - - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); - dev_ctx.SetBlob(key_prim, reorder_p); - dev_ctx.SetBlob(key_src_mem, src_memory); - dev_ctx.SetBlob(key_dst_mem, dst_memory); - } else { - src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(to_void_cast(input_data)); - - dst_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - if (with_shift) - std::fill(output_data, output_data + output->numel(), reorder_shift); - dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); + + auto x_tz = phi::vectorize(x->dims()); + auto x_paddle_dtype = framework::TransToProtoVarType(x->dtype()); + auto out_paddle_dtype = framework::TransToProtoVarType(out->dtype()); + + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; // same shift and scale for whole tensor + + const float reorder_scale = 1. 
/ quantization_scale; + attrs.set_output_scales(mask, {reorder_scale}); + + if (with_shift) { + attrs.set_zero_points(DNNL_ARG_SRC, mask, + {static_cast(quantization_shift)}); } + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 4cae3f0c737115..8cbe46bee481ab 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "dnnl.hpp" +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/quantize_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -34,83 +35,73 @@ template class QuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE( - scale_data, 0.0f, - platform::errors::InvalidArgument("Quantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Quantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Quantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Output"); + + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_scale = quantization_scale != 1.0f; + const bool with_shift = quantization_shift != 0.0f; + + PADDLE_ENFORCE_NE(quantization_scale, 0.0f, + platform::errors::InvalidArgument( + "Quantization scale must be different than 0.0f")); + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Quantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - std::vector pipeline; - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); + auto x_tz = phi::vectorize(x->dims()); - const T* input_data = input->data(); + const bool is_negative_input = ctx.Attr("is_negative_input"); + const bool bfloat16 = ctx.Attr("bfloat16"); - bool is_negative_input = ctx.Attr("is_negative_input"); - 
bool bfloat16 = ctx.Attr("bfloat16"); + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; - // TODO(jczaja): Refactor with Acquire API - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - - std::string out_layout = ctx.Attr("output_format"); - MKLDNNMemoryFormat out_format = - platform::data_format_to_memory_format(out_layout); - dnnl::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); + if (with_scale) { + attrs.set_output_scales(mask, {quantization_scale}); + } if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - // memset casts scale_shift to unsigned char (uint8_t) internally - std::memset(output_data, scale_shift, output->numel()); + attrs.set_zero_points(DNNL_ARG_DST, mask, + {static_cast(quantization_shift)}); } - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); + framework::proto::VarType::Type x_paddle_dtype = + framework::TransToProtoVarType(x->dtype()); + framework::proto::VarType::Type out_paddle_dtype; - std::shared_ptr dst_md; if (bfloat16) { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::BF16; } else if (is_negative_input && !with_shift) { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::INT8; } else { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::UINT8; } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); + + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 867c5f212ba6c1..9d3b8e2407fbfb 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -34,6 +34,12 @@ cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, return cast_type; } +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type) { + return GetCastDataType(framework::TransToProtoVarType(src_type), + framework::TransToProtoVarType(dst_type)); +} + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type) { for 
(auto it = MLU_SUPPORTED_CAST_TYPE.begin(); it != MLU_SUPPORTED_CAST_TYPE.end(); ++it) { @@ -2713,17 +2719,16 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } -/* static */ void MLUCnnl::ScatterNd(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t indices_desc, - const void* indices, - const cnnlTensorDescriptor_t updates_desc, - const void* updates, - const cnnlTensorDescriptor_t output_desc, - void* output) { +/* static */ void MLUCnnl::ScatterNd( + const ExecutionContext& ctx, cnnlScatterNdMode_t mode, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices, - updates_desc, updates, output_desc, - output)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlScatterNd_v2(handle, mode, indices_desc, indices, updates_desc, + updates, input_desc, input, output_desc, output)); } /* static */ void MLUCnnl::BitWise( @@ -2777,5 +2782,26 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlReciprocal(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::EmbeddingBackward( + const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetEmbeddingBackwardWorkspaceSize( + handle, diff_desc, output_desc, scale_grad_by_freq, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlEmbeddingBackward( + handle, padding_idx, scale_grad_by_freq, indices_desc, indices, diff_desc, + diff, workspace_ptr, workspace_size, output_desc, output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 24db6c760d78ab..f048ac7c5c3be0 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -175,6 +175,10 @@ const std::map, cnnlCastDataType_t> cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, const VT::Type& dst_type); + +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type); + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type); cnnlDeviceType_t GetCnnlDev(int dev_ordinal); @@ -1202,11 +1206,13 @@ class MLUCnnl { const void* k, const int k_int, const cnnlTensorDescriptor_t output_desc, void* output); - static void ScatterNd(const ExecutionContext& ctx, + static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output); static void BitWise(const ExecutionContext& ctx, @@ -1227,6 +1233,12 @@ class MLUCnnl { const void* input, const cnnlTensorDescriptor_t output_desc, void* output); + + static void EmbeddingBackward( + const 
ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output); }; template diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index f721608cffb082..abfdb62ec34ac3 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -26,6 +26,7 @@ template static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto outputs = ctx.MultiOutput("Out"); + auto embedding_size_vec = ctx.Attr>("size"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); // GpuPSPS only supports float now @@ -44,7 +45,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { #ifdef PADDLE_WITH_HETERPS auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance(); gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths, - 0); + embedding_size_vec, 0); #endif } diff --git a/paddle/fluid/operators/unstack_op_mlu.cc b/paddle/fluid/operators/unstack_op_mlu.cc new file mode 100644 index 00000000000000..9c4dd256a94efe --- /dev/null +++ b/paddle/fluid/operators/unstack_op_mlu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class UnStackMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto out = ctx.MultiOutput("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += x->dims().size(); + int num = x->dims()[axis]; + + std::vector out_descs; + std::vector out_raw_descs; + std::vector out_ptrs; + std::vector new_dims = phi::vectorize(x->dims()); + new_dims[axis] = 1; + for (int i = 0; i < num; i++) { + out[i]->mutable_data(ctx.GetPlace()); + out_descs.emplace_back(MLUCnnlTensorDesc(new_dims.size(), new_dims.data(), + ToCnnlDataType())); + out_raw_descs.push_back(out_descs.back().get()); + out_ptrs.push_back(GetBasePtr(out[i])); + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnl::Split(ctx, num, axis, x_desc.get(), GetBasePtr(x), + out_raw_descs.data(), out_ptrs.data()); + } +}; + +template +class UnStackGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); + + std::vector x_descs; + std::vector x_raw_descs; + std::vector x_ptrs; + for (int i = 0; i < num; i++) { + if (x[i]->dims().size() != 0) { + std::vector in_dims = phi::vectorize(x[i]->dims()); + in_dims.insert(in_dims.begin() + axis, 1); + x_descs.emplace_back(MLUCnnlTensorDesc(in_dims.size(), in_dims.data(), + ToCnnlDataType())); + } else { + int input_dims = 1; + x_descs.emplace_back( + MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType())); + } + x_raw_descs.push_back(x_descs.back().get()); + x_ptrs.push_back(GetBasePtr(x[i])); + } + y->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc y_desc(*y); + MLUCnnl::Concat(ctx, num, axis, x_raw_descs.data(), x_ptrs.data(), + y_desc.get(), GetBasePtr(y)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(unstack, ops::UnStackMLUKernel, + ops::UnStackMLUKernel); + +REGISTER_OP_MLU_KERNEL(unstack_grad, ops::UnStackGradMLUKernel, + ops::UnStackGradMLUKernel); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 12fa933701ef46..13b5005a30fa05 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1057,6 +1057,14 @@ class ReorderMKLDNNHandler { return std::make_shared(*(src_memory_p), *(dst_memory_p)); } + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p, + const dnnl::primitive_attr& attrs) { + return std::make_shared(*(src_memory_p), *(dst_memory_p), + attrs); + } + private: std::vector dims_; framework::proto::VarType::Type vtype_, vtype_dst_; diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 5e2274cb651385..5aac6ada05b18a 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -298,6 +298,8 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_preload_thread_num", &framework::Dataset::SetPreLoadThreadNum, py::call_guard()) + .def("set_graph_device_keys", &framework::Dataset::SetGraphDeviceKeys, + py::call_guard()) 
.def("create_preload_readers", &framework::Dataset::CreatePreLoadReaders, py::call_guard()) .def("destroy_preload_readers", diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4ffb513671c565..e2f4feebf9e3a5 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -178,13 +178,13 @@ void BindHeterClient(py::module* m) { void BindGraphNode(py::module* m) { py::class_(*m, "GraphNode") .def(py::init<>()) - .def("get_id", &GraphNode::get_id) + .def("get_id", &GraphNode::get_py_id) .def("get_feature", &GraphNode::get_feature); } void BindGraphPyFeatureNode(py::module* m) { py::class_(*m, "FeatureNode") .def(py::init<>()) - .def("get_id", &GraphNode::get_id) + .def("get_id", &GraphNode::get_py_id) .def("get_feature", &GraphNode::get_feature); } @@ -336,17 +336,27 @@ void BindGraphGpuWrapper(py::module* m) { *m, "GraphGpuWrapper") .def(py::init([]() { return GraphGpuWrapper::GetInstance(); })) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) - .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) + .def("graph_neighbor_sample", + py::overload_cast( + &GraphGpuWrapper::graph_neighbor_sample)) + .def("graph_neighbor_sample", + py::overload_cast&, int>( + &GraphGpuWrapper::graph_neighbor_sample)) .def("set_device", &GraphGpuWrapper::set_device) + .def("set_feature_separator", &GraphGpuWrapper::set_feature_separator) .def("init_service", &GraphGpuWrapper::init_service) .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) .def("load_edge_file", &GraphGpuWrapper::load_edge_file) - .def("upload_batch", &GraphGpuWrapper::upload_batch) - .def("get_all_id", &GraphGpuWrapper::get_all_id) - .def("init_sample_status", &GraphGpuWrapper::init_sample_status) - .def("free_sample_status", &GraphGpuWrapper::free_sample_status) + .def("upload_batch", + py::overload_cast>&>( + &GraphGpuWrapper::upload_batch)) + .def("upload_batch", + py::overload_cast>&, int>( + &GraphGpuWrapper::upload_batch)) + .def("get_all_id", py::overload_cast(&GraphGpuWrapper::get_all_id)) + .def("get_all_id", py::overload_cast(&GraphGpuWrapper::get_all_id)) .def("load_next_partition", &GraphGpuWrapper::load_next_partition) .def("make_partitions", &GraphGpuWrapper::make_partitions) .def("make_complementary_graph", diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 76e617c7dafcf3..6112a9a1f45b6b 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -375,12 +375,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if attrs['use_ps_gpu']: _program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -614,15 +614,24 @@ def _check_conflict(self, other_pass): return True def _add_push_box_sparse_op(self, program): + insert_index = -1 + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + insert_index = idx for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( 
op.desc, cpt.to_text(set()), []) for op_desc in grad_op_desc: - new_op_desc = program.global_block().desc.append_op() + new_op_desc = program.global_block().desc._insert_op( + insert_index + 1) new_op_desc.copy_from(op_desc) new_op_desc._set_attr(op_role_attr_name, backward) + new_op = paddle.fluid.framework.Operator(program.global_block(), + new_op_desc) + program.global_block().ops.insert(insert_index + 1, new_op) + program.global_block()._sync_with_cpp() def _remove_optimizer_var(self, program): embedding_w = {} @@ -670,7 +679,7 @@ def _remove_lookup_table_grad_op_and_var(self, program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6df7559a22e81..888d517116a15f 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1013,12 +1013,13 @@ def sync_strategy_envs(): if self.context['ps_mode'] == DistributedMode.GEO: self._communicator.init_params(init_params) else: - if role_id == 0: - self._init_all_params(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + if role_id == 0: + self._init_all_params(scopes, send_ctx, dense_map) fleet.util.barrier() - - self._pull_all_dense(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + self._pull_all_dense(scopes, send_ctx, dense_map) fleet.util.barrier() if self.context['ps_mode'] == DistributedMode.GEO: diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index c73ea8b5b0e1a6..55b44309ff71a3 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -901,7 +901,7 @@ def shuffle_batch(x, seed=None): seed = helper.create_variable( name=unique_name.generate("shuffle_batch_seed"), dtype="int64", - persistable=True) + persistable=False) helper.append_op( type='shuffle_batch', inputs={'X': x, diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 84064669c0dc67..70c7c0fb8c4382 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -1042,6 +1042,27 @@ def _set_heter_ps(self, enable_heter_ps=False): """ self.dataset.set_heter_ps(enable_heter_ps) + def set_graph_device_keys(self, device_keys): + """ + """ + self.dataset.set_graph_device_keys(device_keys) + + def set_graph_config(self, config): + """ + """ + self.proto_desc.graph_config.walk_degree = config.get("walk_degree", 1) + self.proto_desc.graph_config.walk_len = config.get("walk_len", 20) + self.proto_desc.graph_config.window = config.get("window", 5) + self.proto_desc.graph_config.once_sample_startid_len = config.get( + "once_sample_startid_len", 8000) + self.proto_desc.graph_config.sample_times_one_chunk = config.get( + "sample_times_one_chunk", 10) + self.proto_desc.graph_config.batch_size = config.get("batch_size", 1) + self.proto_desc.graph_config.debug_mode = config.get("debug_mode", 0) + self.proto_desc.graph_config.first_node_type = config.get( + "first_node_type", "") + self.proto_desc.graph_config.meta_path = config.get("meta_path", "") + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 
2c09abac9e7ba8..51e89cc301cf30 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -293,12 +293,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if use_ps_gpu: program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -576,7 +576,7 @@ def _add_push_box_sparse_op(program): op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) @@ -599,7 +599,7 @@ def _remove_lookup_table_grad_op_and_var(program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 40ff41fe89f47f..dd9d7e760a8e5e 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -103,9 +103,9 @@ def init_worker(self): # prepare for client to client communication if self._role_maker.is_worker(): info = self._fleet_ptr.get_clients_info() - print("IIIIFO: {}".format(info)) + print("Client Info: {}".format(info)) all_info = self._role_maker._worker_gather(info[0]) - print("ALL info: {}".format(all_info)) + print("All Client Info: {}".format(all_info)) self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.set_client2client_config( self._client2client_request_timeout_ms, diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 8dfe9c32cd9734..5f0af296441fff 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -124,14 +124,15 @@ def add_sparse_table(self, table_id, strategy): support_accessor_class = [ 'DownpourFeatureValueAccessor', 'DownpourCtrAccessor', - 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor', - 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor' + 'DownpourCtrDymfAccessor', 'DownpourSparseValueAccessor', + 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor' ] if strategy.get('sparse_accessor_class') is not None: accessor_class = strategy.get('sparse_accessor_class') if accessor_class not in support_accessor_class: raise ValueError( - "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', \ + "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDymfAccessor', \ 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor'], \ but actual %s" % (accessor_class)) else: @@ -141,6 +142,7 @@ def add_sparse_table(self, table_id, strategy): if accessor_class == 'DownpourFeatureValueAccessor' \ 
or accessor_class == 'DownpourCtrAccessor' \ + or accessor_class == 'DownpourCtrDymfAccessor' \ or accessor_class == 'DownpourCtrDoubleAccessor': table.accessor.sparse_sgd_param.learning_rate = strategy.get( 'sparse_learning_rate', 0.05) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 5d7dacc007e6b7..9483556d46f59c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -339,6 +339,7 @@ def _check_config_fleet_with_program_op(self, strategy, table_name, # set sparse_embedx_dim in the strategy according to accessor and use_cvm config if accessor == "DownpourFeatureValueAccessor" \ or accessor == "DownpourCtrAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": if st.get("sparse_embedx_dim") is not None \ @@ -586,6 +587,7 @@ def _minimize(self, # set sparse_embedx_dim in strategy, # user do not have to set it in config_fleet if accessor == "DownpourFeatureValueAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourCtrAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": @@ -873,7 +875,8 @@ def _minimize(self, if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", - "DownpourUnitAccessor", "DownpourDoubleUnitAccessor" + "DownpourUnitAccessor", "DownpourDoubleUnitAccessor", + "DownpourCtrDymfAccessor" ]: opt_info["dump_slot"] = True elif server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 799d93918f2efd..97506ead5fad4d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -737,7 +737,7 @@ def _pull_gpups_sparse(input, for i in range(len(inputs)) ] w = helper.create_parameter( - attr=helper.param_attr, shape=[11], dtype=dtype, is_bias=False) + attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False) helper.append_op( type='pull_gpups_sparse', inputs={'Ids': inputs, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index a0836c959c84b9..fae52ab833b9d4 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle class TestDeQuantizeOp(OpTest): @@ -110,19 +111,6 @@ def set_data_type(self): self.data_type = 'uint16' -class TestDeQuantizeOp_ZeroScale(TestDeQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization scale cannot be 0.0') - - # 2-dim input # P - positive input, with shift class TestDeQuantizeOpShift_2_P(TestDeQuantizeOp): @@ -177,28 +165,6 @@ def set_input_size(self): self.input_size = [2, 3, 4, 5] -class TestDeQuantizeOp_NegativeShift(TestDeQuantizeOp): - def set_shift(self): - 
self.shift = -10.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization shift must be nonnegative.') - - -class TestDeQuantizeOp_TooBigShift(TestDeQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Dequantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py index a7acc5f3f9bf32..c92d870565fbc9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest +import paddle class TestQuantizeOp(OpTest): @@ -104,19 +105,6 @@ def set_is_negative(self): self.is_nagative = False -class TestQuantizeOp_ZeroScale(TestQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization scale cannot be 0.0') - - # 2-dim input # P - positive input class TestQuantizeOpShift_NCHW_2_P(TestQuantizeOp): @@ -201,34 +189,6 @@ def set_output_format(self): self.output_format = 'NHWC' -class TestQuantizeOp_NegativeShift(TestQuantizeOp): - def set_is_negative(self): - self.is_nagative = False - - def set_scale(self): - self.scale = 100.0 - - def set_shift(self): - self.shift = -10.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization shift must be nonnegative.') - - -class TestQuantizeOp_TooBigShift(TestQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Quantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py new file mode 100644 index 00000000000000..f9a08ba4c9b146 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +class TestLookupTableV2(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "lookup_table_v2" + + self.init_dtype() + self.init_dims() + self.init_padding_idx() + np.random.seed(SEED) + w = np.random.random([self.vocab, self.dim]).astype(self.dtype) + x = np.random.randint( + 0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype) + out = w[x] + if self.padding_idx != -1: + out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim) + + self.inputs = { + 'W': OpTest.np_dtype_to_fluid_dtype(w), + 'Ids': OpTest.np_dtype_to_fluid_dtype(x) + } + self.attrs = { + 'is_sparse': False, + 'is_distributed': False, + 'remote_prefetch': False, + 'padding_idx': self.padding_idx + } + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int32 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is not multiple of 32 + self.dim = 20 + + def init_padding_idx(self): + self.padding_idx = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') + + +class TestLookupTableV2FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2Dim32(TestLookupTableV2): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is multiple of 32 + self.dim = 64 + + +class TestLookupTableV2Dim32FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int64 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + self.dim = 64 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2WithPadding(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + +class TestLookupTableV2WithPadding1(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int64 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py new file mode 100644 index 00000000000000..a75a6aa1dfcb92 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import unittest +import paddle + +paddle.enable_static() + + +class TestUnStackOpBase(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append('y{}'.format(i)) + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.set_mlu() + self.init_dtype() + + self.x = np.random.random(size=self.input_dim).astype(self.dtype) + + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + for i in range(self.input_dim[self.axis]): + tmp.append((y_names[i], np.reshape(outs[i], new_shape))) + + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], self.get_y_names()) + + +class TestStackOp3(TestUnStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestUnStackOpBase): + def initParameters(self): + self.axis = -3 + + +class TestStackOp5(TestUnStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestUnStackOpBase): + def initParameters(self): + self.axis = 2 + + +if __name__ == '__main__': + unittest.main()
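A minimal usage sketch of the graph-dataset knobs added in the python/paddle/fluid/dataset.py hunk above. The config keys and their defaults are taken from that hunk; the InMemoryDataset class name and the DatasetFactory entry point are assumptions inferred from context (the hunk shows only the method bodies), and the first_node_type / meta_path values are hypothetical placeholders.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    # Assumed entry point: the new set_graph_config method is added to the
    # dataset class that precedes QueueDataset in fluid/dataset.py, which is
    # taken here to be InMemoryDataset.
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")

    # Each key mirrors a graph_config field set in the diff; omitted keys fall
    # back to the defaults shown there (walk_degree=1, walk_len=20, window=5,
    # once_sample_startid_len=8000, sample_times_one_chunk=10, batch_size=1,
    # debug_mode=0).
    dataset.set_graph_config({
        "walk_degree": 1,
        "walk_len": 20,
        "window": 5,
        "once_sample_startid_len": 8000,
        "sample_times_one_chunk": 10,
        "batch_size": 1,
        "debug_mode": 0,
        "first_node_type": "user",   # hypothetical node type
        "meta_path": "user2item",    # hypothetical meta path
    })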