diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5608b6f6f348b4..5d81e53a695bca 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -227,3 +227,6 @@ endif(WITH_CRYPTO) if(WITH_CUSTOM_DEVICE AND NOT WIN32) add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) endif() +if(WITH_GPU_GRAPH) + add_definitions(-DPADDLE_WITH_GPU_GRAPH) +endif() diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 8ff12265269b28..6a8bf9683bb3b2 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -143,10 +143,8 @@ int32_t GraphBrpcService::add_graph_node(Table *table, int idx_ = *(int *)(request.params(0).c_str()); size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 3) { size_t weight_list_size = request.params(2).size() / sizeof(bool); @@ -177,11 +175,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(idx_, node_ids); return 0; @@ -215,11 +211,6 @@ int32_t GraphBrpcService::Initialize() { &GraphBrpcService::graph_set_node_feat; _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = &GraphBrpcService::sample_neighbors_across_multi_servers; - // _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] = - // &GraphBrpcService::use_neighbors_sample_cache; - // _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] = - // &GraphBrpcService::load_graph_split_config; - // shard初始化,server启动后才可从env获取到server_list的shard信息 InitializeShardInfo(); return 0; @@ -384,9 +375,6 @@ int32_t GraphBrpcService::pull_graph_list(Table *table, int start = *(int *)(request.params(2).c_str()); int size = *(int *)(request.params(3).c_str()); int step = *(int *)(request.params(4).c_str()); - // int start = *(int *)(request.params(0).c_str()); - // int size = *(int *)(request.params(1).c_str()); - // int step = *(int *)(request.params(2).c_str()); std::unique_ptr buffer; int actual_size; ((GraphTable *)table) @@ -406,14 +394,10 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - int sample_size = *(int64_t *)(request.params(2).c_str()); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + int sample_size = *(int *)(request.params(2).c_str()); 
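// Illustrative sketch (not part of this patch): the hunks above consistently
// switch request-param decoding from int64_t to uint64_t node ids and read
// sample_size as a plain int instead of dereferencing it as int64_t. A
// minimal, self-contained helper showing that decoding convention; the
// helper name is hypothetical and it operates directly on the raw param
// bytes rather than on the brpc request object:
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Unpack a byte blob of packed uint64_t node ids (e.g. request.params(1)).
static std::vector<uint64_t> decode_node_ids(const std::string &param) {
  size_t node_num = param.size() / sizeof(uint64_t);
  std::vector<uint64_t> ids(node_num);
  // memcpy instead of a raw pointer cast avoids alignment pitfalls.
  std::memcpy(ids.data(), param.data(), node_num * sizeof(uint64_t));
  return ids;
}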
bool need_weight = *(bool *)(request.params(3).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - // int sample_size = *(int64_t *)(request.params(1).c_str()); - // bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); ((GraphTable *)table) @@ -433,7 +417,7 @@ int32_t GraphBrpcService::graph_random_sample_nodes( brpc::Controller *cntl) { int type_id = *(int *)(request.params(0).c_str()); int idx_ = *(int *)(request.params(1).c_str()); - size_t size = *(int64_t *)(request.params(2).c_str()); + size_t size = *(uint64_t *)(request.params(2).c_str()); // size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; @@ -459,11 +443,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(2), "\t"); @@ -497,22 +479,15 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( } int idx_ = *(int *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t), + size_t node_num = request.params(1).size() / sizeof(uint64_t), size_of_size_t = sizeof(size_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - int sample_size = *(int64_t *)(request.params(2).c_str()); - bool need_weight = *(int64_t *)(request.params(3).c_str()); - - // size_t node_num = request.params(0).size() / sizeof(int64_t), - // size_of_size_t = sizeof(size_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - // int sample_size = *(int64_t *)(request.params(1).c_str()); - // bool need_weight = *(int64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable - // *)table).filter_out_non_exist_nodes(node_data, sample_size); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + int sample_size = *(int *)(request.params(2).c_str()); + bool need_weight = *(bool *)(request.params(3).c_str()); + std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = GetRank(); for (int query_idx = 0; query_idx < node_num; ++query_idx) { @@ -535,7 +510,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -624,7 +599,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(int64_t) * node_num); + sizeof(uint64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, 
sizeof(int)); closure->request(request_idx) @@ -661,11 +636,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, } int idx_ = *(int *)(request.params(0).c_str()); - // size_t node_num = request.params(0).size() / sizeof(int64_t); - // int64_t *node_data = (int64_t *)(request.params(0).c_str()); - size_t node_num = request.params(1).size() / sizeof(int64_t); - int64_t *node_data = (int64_t *)(request.params(1).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(1).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(1).c_str()); + std::vector node_ids(node_data, node_data + node_num); // std::vector feature_names = // paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index 55beb9b3932a62..892873d2294c49 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -81,14 +81,14 @@ class GraphPyService { graph_proto->set_table_name("cpu_graph_table"); graph_proto->set_use_cache(false); - for (int i = 0; i < id_to_edge.size(); i++) + for (int i = 0; i < (int)id_to_edge.size(); i++) graph_proto->add_edge_types(id_to_edge[i]); - for (int i = 0; i < id_to_feature.size(); i++) { + for (int i = 0; i < (int)id_to_feature.size(); i++) { graph_proto->add_node_types(id_to_feature[i]); auto feat_node = id_to_feature[i]; ::paddle::distributed::GraphFeature* g_f = graph_proto->add_graph_feature(); - for (int x = 0; x < table_feat_conf_feat_name[i].size(); x++) { + for (int x = 0; x < (int)table_feat_conf_feat_name[i].size(); x++) { g_f->add_name(table_feat_conf_feat_name[i][x]); g_f->add_dtype(table_feat_conf_feat_dtype[i][x]); g_f->add_shape(table_feat_conf_feat_shape[i][x]); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 43dee275a3dc69..2d48d3e4e6449f 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -44,15 +44,86 @@ int32_t GraphTable::Load_to_ssd(const std::string &path, return 0; } +paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( + std::vector &node_ids, int slot_num) { + std::vector> bags(task_pool_size_); + for (auto x : node_ids) { + int location = x % shard_num % task_pool_size_; + bags[location].push_back(x); + } + std::vector> tasks; + std::vector feature_array[task_pool_size_]; + std::vector slot_id_array[task_pool_size_]; + std::vector + node_fea_array[task_pool_size_]; + for (size_t i = 0; i < bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { + paddle::framework::GpuPsGraphFeaNode x; + std::vector feature_ids; + for (size_t j = 0; j < bags[i].size(); j++) { + // TODO use FEATURE_TABLE instead + Node *v = find_node(1, bags[i][j]); + x.node_id = bags[i][j]; + if (v == NULL) { + x.feature_size = 0; + x.feature_offset = 0; + node_fea_array[i].push_back(x); + } else { + // x <- v + x.feature_offset = feature_array[i].size(); + int total_feature_size = 0; + for (int k = 0; k < slot_num; ++k) { + v->get_feature_ids(k, &feature_ids); + total_feature_size += feature_ids.size(); + if (!feature_ids.empty()) { + feature_array[i].insert(feature_array[i].end(), + feature_ids.begin(), feature_ids.end()); + 
slot_id_array[i].insert(slot_id_array[i].end(), + feature_ids.size(), k); + } + } + x.feature_size = total_feature_size; + node_fea_array[i].push_back(x); + } + } + return 0; + })); + } + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + paddle::framework::GpuPsCommGraphFea res; + uint64_t tot_len = 0; + for (int i = 0; i < task_pool_size_; i++) { + tot_len += feature_array[i].size(); + } + VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len + << "] node_ids_size[" << node_ids.size() << "]"; + res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num); + unsigned int offset = 0, ind = 0; + for (int i = 0; i < task_pool_size_; i++) { + for (int j = 0; j < (int)node_fea_array[i].size(); j++) { + res.node_list[ind] = node_fea_array[i][j]; + res.node_list[ind++].feature_offset += offset; + } + for (size_t j = 0; j < feature_array[i].size(); j++) { + res.feature_list[offset + j] = feature_array[i][j]; + res.slot_id_list[offset + j] = slot_id_array[i][j]; + } + offset += feature_array[i].size(); + } + return res; +} + paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( - int idx, std::vector ids) { - std::vector> bags(task_pool_size_); + int idx, std::vector ids) { + std::vector> bags(task_pool_size_); for (auto x : ids) { int location = x % shard_num % task_pool_size_; bags[location].push_back(x); } std::vector> tasks; - std::vector edge_array[task_pool_size_]; + std::vector edge_array[task_pool_size_]; std::vector node_array[task_pool_size_]; for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { @@ -69,7 +140,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( x.neighbor_size = v->get_neighbor_size(); x.neighbor_offset = edge_array[i].size(); node_array[i].push_back(x); - for (size_t k = 0; k < x.neighbor_size; k++) { + for (size_t k = 0; k < (size_t)x.neighbor_size; k++) { edge_array[i].push_back(v->get_neighbor_id(k)); } } @@ -84,10 +155,6 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( for (int i = 0; i < task_pool_size_; i++) { tot_len += edge_array[i].size(); } - // res.neighbor_size = tot_len; - // res.node_size = ids.size(); - // res.neighbor_list = new int64_t[tot_len]; - // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; res.init_on_cpu(tot_len, ids.size()); int64_t offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { @@ -103,55 +170,34 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( return res; } -int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, +int32_t GraphTable::add_node_to_ssd(int type_id, int idx, uint64_t src_id, char *data, int len) { if (_db != NULL) { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memcpy(ch, &type_id, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); - memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &src_id, sizeof(uint64_t)); std::string str; if (_db->get(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), str) == 0) { - int64_t *stored_data = ((int64_t *)str.c_str()); - int n = str.size() / sizeof(int64_t); - char *new_data = new char[n * sizeof(int64_t) + len]; - memcpy(new_data, stored_data, n * sizeof(int64_t)); - memcpy(new_data + n * sizeof(int64_t), data, len); + sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { + uint64_t *stored_data = ((uint64_t *)str.c_str()); + int n = str.size() / sizeof(uint64_t); + char *new_data = new char[n * 
sizeof(uint64_t) + len]; + memcpy(new_data, stored_data, n * sizeof(uint64_t)); + memcpy(new_data + n * sizeof(uint64_t), data, len); _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)new_data, - n * sizeof(int64_t) + len); + sizeof(int) * 2 + sizeof(uint64_t), (char *)new_data, + n * sizeof(uint64_t) + len); delete[] new_data; } else { _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + sizeof(int) * 2 + sizeof(uint64_t), (char *)data, len); } - // _db->flush(src_id % shard_num % task_pool_size_); - // std::string x; - // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + - // 2 * sizeof(int), x) ==0){ - // VLOG(0)<<"put result"; - // for(int i = 0;i < x.size();i+=8){ - // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); - // } - //} - // if(src_id == 429){ - // str = ""; - // _db->get(src_id % shard_num % task_pool_size_, ch, - // sizeof(int) * 2 + sizeof(int64_t), str); - // int64_t *stored_data = ((int64_t *)str.c_str()); - // int n = str.size() / sizeof(int64_t); - // VLOG(0)<<"429 has "< rng, int &actual_size) { if (_db == NULL) { actual_size = 0; @@ -159,16 +205,16 @@ char *GraphTable::random_sample_neighbor_from_ssd( } std::string str; VLOG(2) << "sample ssd for key " << id; - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); - memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &id, sizeof(uint64_t)); if (_db->get(id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), str) == 0) { - int64_t *data = ((int64_t *)str.c_str()); - int n = str.size() / sizeof(int64_t); + sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { + uint64_t *data = ((uint64_t *)str.c_str()); + int n = str.size() / sizeof(uint64_t); std::unordered_map m; - // std::vector res; + // std::vector res; int sm_size = std::min(n, sample_size); actual_size = sm_size * Node::id_size; char *buff = new char[actual_size]; @@ -192,7 +238,7 @@ char *GraphTable::random_sample_neighbor_from_ssd( // res.push_back(data[pos]); } for (int i = 0; i < actual_size; i += 8) { - VLOG(2) << "sampled an neighbor " << *(int64_t *)&buff[i]; + VLOG(2) << "sampled an neighbor " << *(uint64_t *)&buff[i]; } return buff; } @@ -201,8 +247,8 @@ char *GraphTable::random_sample_neighbor_from_ssd( } int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, - std::vector &ids) { - std::vector> bags(task_pool_size_); + std::vector &ids) { + std::vector> bags(task_pool_size_); for (auto x : ids) { int location = x % shard_num % task_pool_size_; bags[location].push_back(x); @@ -213,17 +259,17 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; + char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); for (size_t k = 0; k < bags[i].size(); k++) { auto v = bags[i][k]; - memcpy(ch + sizeof(int) * 2, &v, sizeof(int64_t)); + memcpy(ch + sizeof(int) * 2, &v, sizeof(uint64_t)); std::string str; - if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (int j = 0; j < str.size(); j += sizeof(int64_t)) { - int64_t id = *(int64_t 
*)(str.c_str() + j); + for (int j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } } @@ -260,7 +306,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { std::vector weight_cost(part_len, 0); std::vector memory_remaining(part_len, gb_size_by_discount); std::vector score(part_len, 0); - std::unordered_map id_map; + std::unordered_map id_map; std::vector iters; for (int i = 0; i < task_pool_size_; i++) { iters.push_back(_db->get_iterator(i)); @@ -268,7 +314,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { } int next = 0; while (iters.size()) { - if (next >= iters.size()) { + if (next >= (int)iters.size()) { next = 0; } if (!iters[next]->Valid()) { @@ -284,7 +330,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { continue; } std::string value = iters[next]->value().ToString(); - std::int64_t i_key = *(int64_t *)(key.c_str() + sizeof(int) * 2); + std::uint64_t i_key = *(uint64_t *)(key.c_str() + sizeof(int) * 2); for (int i = 0; i < part_len; i++) { if (memory_remaining[i] < (int64_t)value.size()) { score[i] = -100000.0; @@ -292,8 +338,8 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (int j = 0; j < value.size(); j += sizeof(int64_t)) { - int64_t v = *((int64_t *)(value.c_str() + j)); + for (int j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { index = id_map[v]; @@ -413,8 +459,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; - bool is_weighted = false; - int valid_count = 0; for (auto path : paths) { std::ifstream file(path); std::string line; @@ -425,13 +469,13 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, if (values.size() < 2) continue; auto src_id = std::stoll(values[0]); auto dist_ids = paddle::string::split_string(values[1], ";"); - std::vector dist_data; + std::vector dist_data; for (auto x : dist_ids) { dist_data.push_back(std::stoll(x)); - total_memory_cost += sizeof(int64_t); + total_memory_cost += sizeof(uint64_t); } add_node_to_ssd(0, idx, src_id, (char *)dist_data.data(), - (int)(dist_data.size() * sizeof(int64_t))); + (int)(dist_data.size() * sizeof(uint64_t))); } } VLOG(0) << "total memory cost = " << total_memory_cost << " bytes"; @@ -440,9 +484,6 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, int32_t GraphTable::dump_edges_to_ssd(int idx) { VLOG(2) << "calling dump edges to ssd"; - const int64_t fixed_size = 10000; - // std::vector edge_array[task_pool_size_]; - std::vector> count(task_pool_size_); std::vector> tasks; auto &shards = edge_shards[idx]; for (size_t i = 0; i < shards.size(); ++i) { @@ -450,15 +491,14 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { [&, i, this]() -> int64_t { int64_t cost = 0; std::vector &v = shards[i]->get_bucket(); - size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { - std::vector s; - for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + std::vector s; + for (int k = 0; k < (int)v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } - cost += v[j]->get_neighbor_size() * sizeof(int64_t); + cost += v[j]->get_neighbor_size() * sizeof(uint64_t); add_node_to_ssd(0, idx, v[j]->get_id(), (char 
*)s.data(), - s.size() * sizeof(int64_t)); + s.size() * sizeof(uint64_t)); } return cost; })); @@ -470,7 +510,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { VLOG(0) << "make_complementary_graph"; const int64_t fixed_size = byte_size / 8; // std::vector edge_array[task_pool_size_]; - std::vector> count(task_pool_size_); + std::vector> count(task_pool_size_); std::vector> tasks; auto &shards = edge_shards[idx]; for (size_t i = 0; i < shards.size(); ++i) { @@ -480,7 +520,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { // size_t location = v[j]->get_id(); - for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { count[ind][v[j]->get_neighbor_id(k)]++; } } @@ -488,9 +528,9 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { })); } for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); - std::unordered_map final_count; - std::map> count_to_id; - std::vector buffer; + std::unordered_map final_count; + std::map> count_to_id; + std::vector buffer; clear_graph(idx); for (int i = 0; i < task_pool_size_; i++) { @@ -527,6 +567,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { bucket[i]->build_sampler(sample_type); } } + return 0; } #endif @@ -821,7 +862,7 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { +int32_t GraphTable::add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id) { size_t src_shard_id = src_id % shard_num; if (src_shard_id >= shard_end || src_shard_id < shard_start) { @@ -832,11 +873,11 @@ int32_t GraphTable::add_comm_edge(int idx, int64_t src_id, int64_t dst_id) { edge_shards[idx][index]->add_neighbor(src_id, dst_id, 1.0); return 0; } -int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, +int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, std::vector &is_weight_list) { auto &shards = edge_shards[idx]; size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -861,9 +902,9 @@ int32_t GraphTable::add_graph_node(int idx, std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(int idx, std::vector &id_list) { +int32_t GraphTable::remove_graph_node(int idx, std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -896,7 +937,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(int64_t id) { +void GraphShard::delete_node(uint64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -908,7 +949,7 @@ void GraphShard::delete_node(int64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(int64_t id) { +GraphNode *GraphShard::add_graph_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ 
-924,19 +965,25 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(int64_t id) { + +FeatureNode *GraphShard::add_feature_node(uint64_t id, bool is_overlap) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); + return (FeatureNode *)bucket[node_location[id]]; } - return (FeatureNode *)bucket[node_location[id]]; + if (is_overlap) { + return (FeatureNode *)bucket[node_location[id]]; + } + + return NULL; } -void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { +void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(int64_t id) { +Node *GraphShard::find_node(uint64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? nullptr : bucket[iter->second]; } @@ -974,11 +1021,11 @@ int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { int32_t GraphTable::get_nodes_ids_by_ranges( int type_id, int idx, std::vector> ranges, - std::vector &res) { + std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; @@ -994,7 +1041,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&shards, this, first, second, i]() -> std::vector { + [&shards, this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -1011,10 +1058,11 @@ int32_t GraphTable::get_nodes_ids_by_ranges( return 0; } +// TODO opt load all node_types in once reading int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { auto paths = paddle::string::split_string(path, ";"); - int64_t count = 0; - int64_t valid_count = 0; + uint64_t count = 0; + uint64_t valid_count = 0; int idx = 0; if (node_type == "") { VLOG(0) << "node_type not specified, loading edges to " << id_to_feature[0] @@ -1027,53 +1075,47 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } idx = feature_to_id[node_type]; } - for (auto path : paths) { - std::ifstream file(path); - std::string line; - while (std::getline(file, line)) { - auto values = paddle::string::split_string(line, "\t"); - if (values.size() < 2) continue; - auto id = std::stoull(values[1]); - - size_t shard_id = id % shard_num; - if (shard_id >= shard_end || shard_id < shard_start) { - VLOG(4) << "will not load " << id << " from " << path + + VLOG(0) << "Begin GraphTable::load_nodes() node_type[" << node_type << "]"; + std::vector> tasks; + for (size_t i = 0; i < paths.size(); i++) { + tasks.push_back(load_node_edge_task_pool[i % load_thread_num]->enqueue( + [&, i, idx, this]() -> int { + VLOG(0) << "Begin GraphTable::load_nodes(), path[" << paths[i] << "]"; + std::ifstream file(paths[i]); + std::string line; + uint64_t local_count = 0; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + if (values[0] != node_type) { + continue; + } + + auto id = std::stoull(values[1]); + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || 
shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path << ", please check id distribution"; - continue; - } - - if (count % 1000000 == 0) { - VLOG(0) << count << " nodes are loaded from filepath"; - VLOG(0) << line; - } - count++; - - std::string nt = values[0]; - if (nt != node_type) { - continue; - } - - size_t index = shard_id - shard_start; - - // auto node = shards[index]->add_feature_node(id); - auto node = feature_shards[idx][index]->add_feature_node(id); - node->set_feature_size(feat_name[idx].size()); + continue; + } - for (size_t slice = 2; slice < values.size(); slice++) { - auto feat = this->parse_feature(idx, values[slice]); - if (feat.first >= 0) { - node->set_feature(feat.first, feat.second); - } else { - VLOG(4) << "Node feature: " << values[slice] - << " not in feature_map."; + size_t index = shard_id - shard_start; + auto node = feature_shards[idx][index]->add_feature_node(id, false); + if (node != NULL) { + node->set_feature_size(feat_name[idx].size()); + for (size_t slice = 2; slice < values.size(); slice++) { + parse_feature(idx, values[slice], node); + } + local_count++; } } - valid_count++; - } + VLOG(0) << "node_type[" << node_type << "] loads " << local_count << " nodes from filepath->" << paths[i]; + return 0; + })); } - - VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type - << " are loaded successfully in " << path; + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + VLOG(0) << "successfully load all node_type[" << node_type << "] data"; return 0; } @@ -1089,9 +1131,8 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); if (search_level == 2) total_memory_cost = 0; - const int64_t fixed_load_edges = 1000000; + const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1107,104 +1148,125 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, } auto paths = paddle::string::split_string(path, ";"); - int64_t count = 0; + uint64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; - int valid_count = 0; - for (auto path : paths) { - std::ifstream file(path); - std::string line; - while (std::getline(file, line)) { - auto values = paddle::string::split_string(line, "\t"); - count++; - if (values.size() < 2) continue; - auto src_id = std::stoull(values[0]); - auto dst_id = std::stoull(values[1]); - if (reverse_edge) { - std::swap(src_id, dst_id); - } - float weight = 1; - if (values.size() == 3) { - weight = std::stof(values[2]); - sample_type = "weighted"; - is_weighted = true; - } - - size_t src_shard_id = src_id % shard_num; + + VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; + std::vector> tasks; + for (int i = 0; i < paths.size(); i++) { + tasks.push_back(load_node_edge_task_pool[i % load_thread_num]->enqueue( + [&, i, idx, this]() -> int { + uint64_t local_count = 0; + std::ifstream file(paths[i]); + std::string line; + auto path_split = paddle::string::split_string(paths[i], "/"); + auto part_name_split = paddle::string::split_string(path_split[path_split.size() - 1], "-"); + auto part_num = std::stoull(part_name_split[part_name_split.size() - 1]); + + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + local_count++; + if (values.size() < 2) 
continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + size_t src_shard_id = src_id % shard_num; + if (src_shard_id != (part_num % shard_num)) { + continue; + } + + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } - if (src_shard_id >= shard_end || src_shard_id < shard_start) { - VLOG(4) << "will not load " << src_id << " from " << path + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path << ", please check id distribution"; - continue; - } - - if (count % 1000000 == 0) { - VLOG(0) << count << " edges are loaded from filepath"; - VLOG(0) << line; + continue; + } + + size_t index = src_shard_id - shard_start; + edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); + edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); } - - size_t index = src_shard_id - shard_start; - edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); - edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); - valid_count++; #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - if (count > fixed_load_edges && search_level == 2) { + if (search_level == 2) { dump_edges_to_ssd(idx); VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; clear_graph(idx); - count = 0; } #endif - } + VLOG(0) << local_count << " edges are loaded from filepath->" << paths[i]; + return 0; + })); } - VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " - << path; + for (int j = 0; j < (int)tasks.size(); j++) tasks[j].get(); + VLOG(0) << "successfully load all edge_type[" << edge_type << "] data"; -// Build Sampler j -#ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - if (search_level == 2) { - if (count > 0) { - dump_edges_to_ssd(idx); - VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; - clear_graph(idx); - count = 0; - } - return 0; - } -#endif +#ifdef PADDLE_WITH_GPU_GRAPH + // To reduce memory overhead, CPU samplers won't be created in gpugraph. + // In order not to affect the sampler function of other scenario, + // this optimization is only performed in load_edges function. + VLOG(0) << "run in gpugraph mode!"; +#else + VLOG(0) << "build sampler ... "; for (auto &shard : edge_shards[idx]) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); } } - +#endif return 0; } + +Node *GraphTable::find_node(int type_id, uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + Node *node = nullptr; + size_t index = shard_id - shard_start; + auto &search_shards = type_id == 0 ? edge_shards : feature_shards; + for (auto& search_shard: search_shards) { + PADDLE_ENFORCE_NOT_NULL(search_shard[index]); + node = search_shard[index]->find_node(id); + if (node != nullptr) { + break; + } + } + return node; +} -Node *GraphTable::find_node(int type_id, int idx, int64_t id) { +Node *GraphTable::find_node(int type_id, int idx, uint64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { return nullptr; } size_t index = shard_id - shard_start; auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; + PADDLE_ENFORCE_NOT_NULL(search_shards[index]); Node *node = search_shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return node_id % shard_num % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index( + uint64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } int32_t GraphTable::clear_nodes(int type_id, int idx) { auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { search_shards[i]->clear(); } return 0; @@ -1268,16 +1330,16 @@ int32_t GraphTable::random_sample_nodes(int type_id, int idx, int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(type_id, idx, second_half, res); - actual_size = res.size() * sizeof(int64_t); + actual_size = res.size() * sizeof(uint64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - int idx, int64_t *node_ids, int sample_size, + int idx, uint64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -1295,7 +1357,7 @@ int32_t GraphTable::random_sample_neighbors( for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - int64_t node_id; + uint64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -1341,7 +1403,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - int64_t id; + uint64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -1376,13 +1438,14 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(int idx, const std::vector &node_ids, +int32_t GraphTable::get_node_feat(int idx, + const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idy = 0; idy < node_num; ++idy) { - int64_t node_id = node_ids[idy]; + uint64_t node_id = node_ids[idy]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, idy, node_id]() -> int { Node *node = find_node(1, idx, node_id); @@ -1410,13 +1473,13 @@ int32_t GraphTable::get_node_feat(int idx, const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - int idx, const std::vector &node_ids, + int idx, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idy = 0; idy < node_num; ++idy) { - int64_t node_id = node_ids[idy]; + uint64_t node_id = node_ids[idy]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, idy, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; @@ -1439,59 +1502,126 @@ int32_t GraphTable::set_node_feat( return 0; } -std::pair GraphTable::parse_feature( - int idx, std::string feat_str) { +void string_vector_2_string(std::vector::iterator strs_begin, + std::vector::iterator strs_end, char delim, std::string* output) { + size_t i = 0; + for (std::vector::iterator iter = strs_begin; iter != strs_end; ++iter) { + if (i > 0) { + *output += delim; + } + + *output += *iter; + ++i; + } +} + +int GraphTable::parse_feature(int idx, const std::string& feat_str, + FeatureNode* node) { // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, // "") - auto fields = paddle::string::split_string(feat_str, " "); - if (feat_id_map[idx].count(fields[0])) { - // if (this->feat_id_map.count(fields[0])) { - int32_t id = this->feat_id_map[idx][fields[0]]; + std::vector fields = + paddle::string::split_string(feat_str, feature_separator_); + auto it = feat_id_map[idx].find(fields[0]); + if (it != feat_id_map[idx].end()) { + int32_t id = it->second; + std::string* fea_ptr = node->mutable_feature(id); std::string dtype = this->feat_dtype[idx][id]; - std::vector values(fields.begin() + 1, fields.end()); if (dtype == "feasign") { - return std::make_pair( - int32_t(id), paddle::string::join_strings(values, ' ')); + string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); + return 0; } else if (dtype == "string") { - return std::make_pair( - int32_t(id), paddle::string::join_strings(values, ' ')); + string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); + return 0; } else if (dtype == "float32") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "float64") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "int32") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + 
FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } else if (dtype == "int64") { - return std::make_pair( - int32_t(id), FeatureNode::parse_value_to_bytes(values)); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + return 0; } + } else { + VLOG(2) << "feature_name[" << fields[0] + << "] is not in feat_id_map, ntype_id[" << idx + << "] feat_id_map_size[" << feat_id_map.size() << "]"; } - return std::make_pair(-1, ""); + + return -1; +} + +std::vector> GraphTable::get_all_id(int type_id, int slice_num) { + std::vector> res(slice_num); + auto &search_shards = type_id == 0 ? edge_shards : feature_shards; + std::vector>> tasks; + for (int idx = 0; idx < search_shards.size(); idx++) { + for (int j = 0; j < search_shards[idx].size(); j++) { + tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( + [&search_shards, idx, j]() -> std::vector { + return search_shards[idx][j]->get_all_id(); + })); + } + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) { + res[(uint64_t)(id) % slice_num].push_back(id); + } + } + return res; } -std::vector> GraphTable::get_all_id(int type_id, int idx, - int slice_num) { - std::vector> res(slice_num); +std::vector> GraphTable::get_all_id(int type_id, int idx, + int slice_num) { + std::vector> res(slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector>> tasks; + VLOG(0) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; for (int i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&search_shards, i]() -> std::vector { + [&search_shards, i]() -> std::vector { return search_shards[i]->get_all_id(); })); } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } + VLOG(0) << "end task, task_pool_size_[" << task_pool_size_ << "]"; for (size_t i = 0; i < tasks.size(); i++) { auto ids = tasks[i].get(); - for (auto &id : ids) res[(uint64_t)(id) % slice_num].push_back(id); + for (auto &id : ids) res[id % slice_num].push_back(id); } return res; } + +int GraphTable::get_all_feature_ids(int type_id, int idx, int slice_num, + std::vector>* output) { + output->resize(slice_num); + auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; + std::vector>> tasks; + for (int i = 0; i < search_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&search_shards, i]() -> std::vector { + return search_shards[i]->get_all_feature_ids(); + })); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) (*output)[id % slice_num].push_back(id); + } + return 0; +} + int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, int total_size, std::unique_ptr &buffer, @@ -1542,7 +1672,11 @@ int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, return 0; } -int32_t GraphTable::get_server_index_by_id(int64_t id) { +void GraphTable::set_feature_separator(const std::string &ch) { + feature_separator_ = ch; +} + +int32_t GraphTable::get_server_index_by_id(uint64_t id) { return id % shard_num / shard_num_per_server; } int32_t GraphTable::Initialize(const TableParameter &config, @@ -1617,6 +1751,10 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } + load_node_edge_task_pool.resize(load_thread_num); + for (size_t i = 0; i< load_node_edge_task_pool.size(); i++) { + load_node_edge_task_pool[i].reset(new ::ThreadPool(1)); + } auto graph_feature = graph.graph_feature(); auto node_types = graph.node_types(); auto edge_types = graph.edge_types(); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 25bec5276e7293..5692bbfd3b8dd2 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -56,33 +56,44 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + std::vector get_ids_by_range(int start, int end) { + std::vector res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - std::vector get_all_id() { - std::vector res; + std::vector get_all_id() { + std::vector res; for (int i = 0; i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(int64_t id); + std::vector get_all_feature_ids() { + // TODO by huwei02, dedup + std::vector total_res; + std::set res; + for (int i = 0; i < (int)bucket.size(); i++) { + res.clear(); + bucket[i]->get_feature_ids(&res); + total_res.insert(total_res.end(), res.begin(), res.end()); + } + return total_res; + } + GraphNode *add_graph_node(uint64_t id); GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(int64_t id); - Node *find_node(int64_t id); - void delete_node(int64_t id); + FeatureNode *add_feature_node(uint64_t id, bool is_overlap = true); + Node *find_node(uint64_t id); + void delete_node(uint64_t id); void clear(); - void add_neighbor(int64_t id, int64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(uint64_t id, uint64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; @@ -90,10 +101,10 @@ enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct 
SampleKey { int idx; - int64_t node_key; + uint64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(int _idx, int64_t _node_key, size_t _sample_size, + SampleKey(int _idx, uint64_t _node_key, size_t _sample_size, bool _is_weighted) { idx = _idx; node_key = _node_key; @@ -455,7 +466,7 @@ class GraphTable : public Table { int step); virtual int32_t random_sample_neighbors( - int idx, int64_t *node_ids, int sample_size, + int idx, uint64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight); @@ -465,7 +476,7 @@ class GraphTable : public Table { virtual int32_t get_nodes_ids_by_ranges( int type_id, int idx, std::vector> ranges, - std::vector &res); + std::vector &res); virtual int32_t Initialize() { return 0; } virtual int32_t Initialize(const TableParameter &config, const FsClientParameter &fs_config); @@ -475,17 +486,21 @@ class GraphTable : public Table { int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - std::vector> get_all_id(int type, int idx, - int slice_num); + std::vector> get_all_id(int type, int slice_num); + std::vector> get_all_id(int type, int idx, + int slice_num); + int get_all_feature_ids(int type, int idx, + int slice_num, std::vector>* output); int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(int idx, std::vector &id_list, + int32_t add_graph_node(int idx, std::vector &id_list, std::vector &is_weight_list); - int32_t remove_graph_node(int idx, std::vector &id_list); + int32_t remove_graph_node(int idx, std::vector &id_list); - int32_t get_server_index_by_id(int64_t id); - Node *find_node(int type_id, int idx, int64_t id); + int32_t get_server_index_by_id(uint64_t id); + Node *find_node(int type_id, int idx, uint64_t id); + Node *find_node(int type_id, uint64_t id); virtual int32_t Pull(TableContext &context) { return 0; } virtual int32_t Push(TableContext &context) { return 0; } @@ -510,17 +525,17 @@ class GraphTable : public Table { this->server_num = server_num; return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); - virtual uint32_t get_thread_pool_index(int64_t node_id); - virtual std::pair parse_feature(int idx, - std::string feat_str); + virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int parse_feature(int idx, const std::string& feat_str, + FeatureNode* node); - virtual int32_t get_node_feat(int idx, const std::vector &node_ids, + virtual int32_t get_node_feat(int idx, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - int idx, const std::vector &node_ids, + int idx, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -554,20 +569,22 @@ class GraphTable : public Table { virtual void make_partitions(int idx, int64_t gb_size, int device_len); virtual void export_partition_files(int idx, std::string file_path); virtual char *random_sample_neighbor_from_ssd( - int idx, int64_t id, int sample_size, + int idx, uint64_t id, int sample_size, const std::shared_ptr rng, int &actual_size); - virtual int32_t add_node_to_ssd(int type_id, int idx, int64_t src_id, + virtual int32_t add_node_to_ssd(int type_id, int idx, uint64_t src_id, char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( - int idx, std::vector ids); + int idx, std::vector ids); + virtual 
paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( + std::vector &node_ids, int slot_num); int32_t Load_to_ssd(const std::string &path, const std::string ¶m); - int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); + int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); int32_t make_complementary_graph(int idx, int64_t byte_size); int32_t dump_edges_to_ssd(int idx); int32_t get_partition_num(int idx) { return partitions[idx].size(); } - std::vector get_partition(int idx, int index) { - if (idx >= partitions.size() || index >= partitions[idx].size()) - return std::vector(); + std::vector get_partition(int idx, int index) { + if (idx >= (int)partitions.size() || index >= (int)partitions[idx].size()) + return std::vector(); return partitions[idx][index]; } int32_t load_edges_to_ssd(const std::string &path, bool reverse_edge, @@ -576,17 +593,19 @@ class GraphTable : public Table { void set_search_level(int search_level) { this->search_level = search_level; } int search_level; int64_t total_memory_cost; - std::vector>> partitions; + std::vector>> partitions; int next_partition; #endif - virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id); + virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); + void set_feature_separator(const std::string &ch); std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; int task_pool_size_ = 24; + int load_thread_num = 150; const int random_sample_nodes_ranges = 3; - std::vector>> node_weight; + std::vector>> node_weight; std::vector> feat_name; std::vector> feat_dtype; std::vector> feat_shape; @@ -598,9 +617,10 @@ class GraphTable : public Table { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; + std::vector> load_node_edge_task_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; int cache_size_limit; int cache_ttl; @@ -609,9 +629,10 @@ class GraphTable : public Table { #ifdef PADDLE_WITH_HETERPS // paddle::framework::GpuPsGraphTable gpu_graph_table; paddle::distributed::RocksDBHandler *_db; -// std::shared_ptr<::ThreadPool> graph_sample_pool; -// std::shared_ptr graph_sampler; -// REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) + // std::shared_ptr<::ThreadPool> graph_sample_pool; + // std::shared_ptr graph_sampler; + // REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) + std::string feature_separator_ = std::string(" "); #endif }; @@ -630,7 +651,7 @@ class CompleteGraphSampler : public GraphSampler { protected: GraphTable *graph_table; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; // std::vector sample_res; // std::shared_ptr random; int gpu_num; @@ -649,11 +670,11 @@ class BasicBfsGraphSampler : public GraphSampler { GraphTable *graph_table; // std::vector> sample_nodes; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; size_t gpu_num; int init_search_size, node_num_for_each_shard, edge_num_for_each_node; int rounds, interval; - std::vector>> + std::vector>> sample_neighbors_map; }; #endif diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h 
index c6c594036d4fc9..5f567d0c4b4931 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,7 +18,12 @@ #include #include #include +#include +#include "glog/logging.h" #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/core/enforce.h" + namespace paddle { namespace distributed { @@ -29,6 +34,7 @@ class Node { virtual ~Node() {} static int id_size, int_size, weight_size; uint64_t get_id() { return id; } + int64_t get_py_id() { return (int64_t)id; } void set_id(uint64_t id) { this->id = id; } virtual void build_edges(bool is_weighted) {} @@ -45,7 +51,13 @@ class Node { virtual void to_buffer(char *buffer, bool need_feature); virtual void recover_from_buffer(char *buffer); virtual std::string get_feature(int idx) { return std::string(""); } - virtual void set_feature(int idx, std::string str) {} + virtual int get_feature_ids(std::set *res) const { + return 0; + } + virtual int get_feature_ids(int slot_idx, std::vector *res) const { + return 0; + } + virtual void set_feature(int idx, const std::string& str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } virtual size_t get_neighbor_size() { return 0; } @@ -94,7 +106,51 @@ class FeatureNode : public Node { } } - virtual void set_feature(int idx, std::string str) { + virtual int get_feature_ids(std::set *res) const { + PADDLE_ENFORCE_NOT_NULL(res); + errno = 0; + for (auto& feature_item: feature) { + const char *feat_str = feature_item.c_str(); + auto fields = paddle::string::split_string(feat_str, " "); + char *head_ptr = NULL; + for (auto &field : fields) { + PADDLE_ENFORCE_EQ(field.empty(), false); + uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); + PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + res->insert(feasign); + } + } + PADDLE_ENFORCE_EQ(errno, 0); + return 0; + } + + virtual int get_feature_ids(int slot_idx, std::vector *res) const { + PADDLE_ENFORCE_NOT_NULL(res); + res->clear(); + errno = 0; + if (slot_idx < (int)this->feature.size()) { + const char *feat_str = this->feature[slot_idx].c_str(); + auto fields = paddle::string::split_string(feat_str, " "); + char *head_ptr = NULL; + for (auto &field : fields) { + PADDLE_ENFORCE_EQ(field.empty(), false); + uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); + PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + res->push_back(feasign); + } + } + PADDLE_ENFORCE_EQ(errno, 0); + return 0; + } + + virtual std::string* mutable_feature(int idx) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + return &(this->feature[idx]); + } + + virtual void set_feature(int idx, const std::string& str) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); } @@ -116,6 +172,22 @@ class FeatureNode : public Node { return std::string(buffer, Tsize); } + template + static void parse_value_to_bytes(std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, + std::string* output) { + T v; + size_t feat_str_size = feat_str_end - feat_str_begin; + size_t Tsize = sizeof(T) * feat_str_size; + char buffer[Tsize] = {'\0'}; + for (size_t i = 0; i < feat_str_size; i++) { + std::stringstream ss(*(feat_str_begin + i)); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + output->assign(buffer); + } + template static std::vector parse_bytes_to_array(std::string feat_str) { T v; 
@@ -130,8 +202,9 @@ class FeatureNode : public Node { return out; } - protected: +protected: std::vector feature; }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bb7f3f26463d49..e4f8b76f0dff26 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -321,7 +321,9 @@ if(WITH_DISTRIBUTE) device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog index_sampler index_wrapper sampler index_dataset_proto lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor) + graph_to_program_pass variable_helper timer monitor + heter_service_proto fleet heter_server brpc fleet_executor + graph_gpu_wrapper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index b63f317aae8932..7962e1591f0faf 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2065,6 +2065,7 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } else { so_parser_name_.clear(); } + gpu_graph_data_generator_.SetConfig(data_feed_desc); } void SlotRecordInMemoryDataFeed::LoadIntoMemory() { @@ -2589,6 +2590,7 @@ bool SlotRecordInMemoryDataFeed::Start() { #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) CHECK(paddle::platform::is_gpu_place(this->place_)); pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); + gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); #endif return true; } @@ -2596,27 +2598,31 @@ bool SlotRecordInMemoryDataFeed::Start() { int SlotRecordInMemoryDataFeed::Next() { #ifdef _LINUX this->CheckStart(); - - VLOG(3) << "enable heter next: " << offset_index_ - << " batch_offsets: " << batch_offsets_.size(); - if (offset_index_ >= batch_offsets_.size()) { - VLOG(3) << "offset_index: " << offset_index_ + if (!gpu_graph_mode_) { + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); - return 0; - } - auto& batch = batch_offsets_[offset_index_++]; - this->batch_size_ = batch.second; - VLOG(3) << "batch_size_=" << this->batch_size_ - << ", thread_id=" << thread_id_; - if (this->batch_size_ != 0) { - PutToFeedVec(&records_[batch.first], this->batch_size_); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; } else { - VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" - << thread_id_; + VLOG(3) << "datafeed in gpu graph mode"; + this->batch_size_ = 
gpu_graph_data_generator_.GenerateBatch(); } - VLOG(3) << "enable heter next: " << offset_index_ - << " batch_offsets: " << batch_offsets_.size() - << " baych_size: " << this->batch_size_; return this->batch_size_; #else diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index f9435ec2a32d84..1814fa44da62c4 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -17,7 +17,14 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) +#include +#include +#include +#include +#include "cub/cub.cuh" #include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" namespace paddle { namespace framework { @@ -144,6 +151,714 @@ void SlotRecordInMemoryDataFeed::CopyForTensor( cudaStreamSynchronize(stream); } +__global__ void GraphFillCVMKernel(int64_t *tensor, int len) { + CUDA_KERNEL_LOOP(idx, len) { tensor[idx] = 1; } +} + +int GraphDataGenerator::AcquireInstance(BufState *state) { + // + if (state->GetNextStep()) { + state->Debug(); + return state->len; + } else if (state->GetNextCentrolWord()) { + state->Debug(); + return state->len; + } else if (state->GetNextBatch()) { + state->Debug(); + return state->len; + } + return 0; +} + +// TODO opt +__global__ void GraphFillFeatureKernel(int64_t *id_tensor, int *fill_ins_num, + int64_t *walk, int64_t *feature, int *row, int central_word, + int step, int len, int col_num, int slot_num) { + __shared__ int64_t local_key[CUDA_NUM_THREADS * 2]; + __shared__ int local_num; + __shared__ int global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + if (idx < len) { + int src = row[idx] * col_num + central_word; + if (walk[src] != 0 && walk[src + step] != 0) { + size_t dst = atomicAdd(&local_num, 1); + for (int i = 0; i < slot_num; ++i) { + local_key[dst * 2 * slot_num + i * 2] = feature[src * slot_num + i]; + local_key[dst * 2 * slot_num + i * 2 + 1] = feature[(src + step) * slot_num + i]; + } + } + } + + if (threadIdx.x == 0) { + global_num = atomicAdd(fill_ins_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + for (int i = 0; i < slot_num; ++i) { + id_tensor[(global_num * 2 + 2 * threadIdx.x) * slot_num + i] + = local_key[(2 * threadIdx.x) * slot_num + i]; + id_tensor[(global_num * 2 + 2 * threadIdx.x + 1) * slot_num + i] = + local_key[(2 * threadIdx.x + 1) * slot_num + i]; + } + } +} + +__global__ void GraphFillIdKernel(int64_t *id_tensor, int *fill_ins_num, + int64_t *walk, int *row, int central_word, + int step, int len, int col_num) { + __shared__ int64_t local_key[CUDA_NUM_THREADS * 2]; + __shared__ int local_num; + __shared__ int global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + // int dst = idx * 2; + // id_tensor[dst] = walk[src]; + // id_tensor[dst + 1] = walk[src + step]; + if (idx < len) { + int src = row[idx] * col_num + central_word; + if (walk[src] != 0 && walk[src + step] != 0) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst * 2] = walk[src]; + local_key[dst * 2 + 1] = walk[src + step]; + } + } + + if (threadIdx.x == 0) { + global_num = atomicAdd(fill_ins_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + id_tensor[global_num * 2 + 2 * threadIdx.x] = local_key[2 * threadIdx.x]; + 
id_tensor[global_num * 2 + 2 * threadIdx.x + 1] = + local_key[2 * threadIdx.x + 1]; + } +} + +__global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { + CUDA_KERNEL_LOOP(idx, len) { + id_tensor[idx] = idx; + } +} + +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; + } + int total_instance = AcquireInstance(&buf_state_); + + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + + if (total_instance == 0) { + int res = FillWalkBuf(d_walk_); + if (!res) { + // graph iterate complete + return -1; + } else { + total_instance = buf_state_.len; + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + // if (total_instance == 0) { + // return -1; + //} + } + + if (slot_num_ > 0) { + FillFeatureBuf(d_walk_, d_feature_); + if (debug_mode_) { + int len = buf_size_ > 5000? 5000: buf_size_; + uint64_t h_walk[len]; + cudaMemcpy(h_walk, d_walk_->ptr(), len * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[len * slot_num_]; + cudaMemcpy(h_feature, d_feature_->ptr(), len * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for(int i = 0; i < len; ++i) { + std::stringstream ss; + for (int j = 0; j < slot_num_; ++j) { + ss << h_feature[i * slot_num_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i << "] = " << (uint64_t)h_walk[i] + << " feature[" << i * slot_num_ << ".." << (i + 1) * slot_num_ << "] = " << ss.str(); + } + } + } + } + + int64_t *walk = reinterpret_cast(d_walk_->ptr()); + int64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + int *random_row = reinterpret_cast(d_random_row_->ptr()); + int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + int len = buf_state_.len; + GraphFillIdKernel<<>>( + ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, + random_row + buf_state_.cursor, buf_state_.central_word, + window_step_[buf_state_.step], len, walk_len_); + int h_pair_num; + cudaMemcpyAsync(&h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, + stream_); + + int64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + if (slot_num_ > 0) { + int64_t *feature = reinterpret_cast(d_feature_->ptr()); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + int len = buf_state_.len; + VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ << "] len[" << len << "]"; + GraphFillFeatureKernel<<>>( + feature_buf + ins_buf_pair_len_ * 2 * slot_num_, d_pair_num, walk, feature, + random_row + buf_state_.cursor, buf_state_.central_word, + window_step_[buf_state_.step], len, walk_len_, slot_num_); + } + + cudaStreamSynchronize(stream_); + ins_buf_pair_len_ += h_pair_num; + + if (debug_mode_) { + int64_t *h_ins_buf = new int64_t[ins_buf_pair_len_ * 2]; + cudaMemcpy(h_ins_buf, ins_buf, 2 * ins_buf_pair_len_ * sizeof(int64_t), + cudaMemcpyDeviceToHost); + VLOG(2) << "h_pair_num = " << h_pair_num + << ", ins_buf_pair_len = " << ins_buf_pair_len_; + for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { + VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; + } + delete[] h_ins_buf; + + int64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; + cudaMemcpy(h_feature_buf, feature_buf, (batch_size_ * 2 * 2) * slot_num_ * sizeof(int64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { + VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; + } + } + return ins_buf_pair_len_; +} + + +int GraphDataGenerator::GenerateBatch() 
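+// GenerateBatch drains up to batch_size_ (center, context) pairs from the
+// instance buffer, calling FillInsBuf to refill it as needed; it then copies
+// the 2 * pair node ids into feed_vec_[0], fills the show/clk tensors with 1s
+// via GraphFillCVMKernel, scatters per-slot feature ids together with their
+// lod tensors, and returns 1 (or 0 once the graph walk data is exhausted).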
{ + platform::CUDADeviceGuard guard(gpuid_); + int res = 0; + while (ins_buf_pair_len_ < batch_size_) { + res = FillInsBuf(); + if (res == -1) { + if (ins_buf_pair_len_ == 0) { + return 0; + } else { + break; + } + } + } + int total_instance = + ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_; + + total_instance *= 2; + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + + int64_t* slot_tensor_ptr_[slot_num_]; + int64_t* slot_lod_tensor_ptr_[slot_num_]; + if (slot_num_ > 0) { + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = + feed_vec_[3 + 2 * i]->mutable_data({total_instance, 1}, this->place_); + slot_lod_tensor_ptr_[i] = + feed_vec_[3 + 2 * i + 1]->mutable_data({total_instance + 1}, this->place_); + } + } + + VLOG(2) << "total_instance: " << total_instance + << ", ins_buf_pair_len = " << ins_buf_pair_len_; + int64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + int64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, ins_cursor, sizeof(int64_t) * total_instance, + cudaMemcpyDeviceToDevice, stream_); + + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + + if (slot_num_ > 0) { + int64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + for (int i = 0; i < slot_num_; ++i) { + int feature_buf_offset = (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; + // TODO huwei02 opt + for (int j = 0; j < total_instance; j += 2) { + VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" + << feature_buf_offset + j * slot_num_ << "]"; + VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" + << feature_buf_offset + j * slot_num_ + 1 << "]"; + cudaMemcpyAsync(slot_tensor_ptr_[i] + j, &feature_buf[feature_buf_offset + j * slot_num_], + sizeof(int64_t) * 2, cudaMemcpyDeviceToDevice, stream_); + } + GraphFillSlotLodKernel<<>>( + slot_lod_tensor_ptr_[i], total_instance + 1); + } + } + + offset_.clear(); + offset_.push_back(0); + offset_.push_back(total_instance); + LoD lod{offset_}; + feed_vec_[0]->set_lod(lod); + if (slot_num_ > 0) { + for (int i = 0; i < slot_num_; ++i) { + feed_vec_[3 + 2 * i]->set_lod(lod); + } + } + + ins_buf_pair_len_ -= total_instance / 2; + + cudaStreamSynchronize(stream_); + + if (debug_mode_) { + int64_t h_slot_tensor[slot_num_][total_instance]; + int64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], slot_tensor_ptr_[i], total_instance * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000? 5000: total_instance; + for(int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i <<"][" << j << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], slot_lod_tensor_ptr_[i], (total_instance + 1) * sizeof(int64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000? 
5000: total_instance + 1; + for(int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i <<"][" << j << "] = " << h_slot_lod_tensor[i][j]; + } + } + } + + return 1; +} + +__global__ void GraphFillSampleKeysKernel(uint64_t *neighbors, + uint64_t *sample_keys, + int *prefix_sum, int *sampleidx2row, + int *tmp_sampleidx2row, + int *actual_sample_size, + int cur_degree, int len) { + CUDA_KERNEL_LOOP(idx, len) { + for (int k = 0; k < actual_sample_size[idx]; k++) { + size_t offset = prefix_sum[idx] + k; + sample_keys[offset] = neighbors[idx * cur_degree + k]; + tmp_sampleidx2row[offset] = sampleidx2row[idx] + k; + } + } +} + +__global__ void GraphDoWalkKernel(uint64_t *neighbors, uint64_t *walk, + int *d_prefix_sum, int *actual_sample_size, + int cur_degree, int step, int len, + int *id_cnt, int *sampleidx2row, + int col_size) { + CUDA_KERNEL_LOOP(i, len) { + for (int k = 0; k < actual_sample_size[i]; k++) { + // int idx = sampleidx2row[i]; + size_t row = sampleidx2row[k + d_prefix_sum[i]]; + // size_t row = idx * cur_degree + k; + size_t col = step; + size_t offset = (row * col_size + col); + walk[offset] = neighbors[i * cur_degree + k]; + } + } +} + +// Fill keys to the first column of walk +__global__ void GraphFillFirstStepKernel(int *prefix_sum, int *sampleidx2row, + uint64_t *walk, uint64_t *keys, + int len, int walk_degree, int col_size, + int *actual_sample_size, + uint64_t *neighbors, + uint64_t *sample_keys) { + CUDA_KERNEL_LOOP(idx, len) { + for (int k = 0; k < actual_sample_size[idx]; k++) { + size_t row = prefix_sum[idx] + k; + sample_keys[row] = neighbors[idx * walk_degree + k]; + sampleidx2row[row] = row; + + size_t offset = col_size * row; + walk[offset] = keys[idx]; + walk[offset + 1] = neighbors[idx * walk_degree + k]; + } + } +} + +// Fill sample_res to the stepth column of walk +void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, + int len, NeighborSampleResult &sample_res, + int cur_degree, int step, + int *len_per_row) { + size_t temp_storage_bytes = 0; + int *d_actual_sample_size = sample_res.actual_sample_size; + uint64_t *d_neighbors = sample_res.val; + int *d_prefix_sum = reinterpret_cast(d_prefix_sum_->ptr()); + uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); + int *d_sampleidx2row = + reinterpret_cast(d_sampleidx2rows_[cur_sampleidx2row_]->ptr()); + int *d_tmp_sampleidx2row = + reinterpret_cast(d_sampleidx2rows_[1 - cur_sampleidx2row_]->ptr()); + + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes, + d_actual_sample_size, + d_prefix_sum + 1, len, stream_)); + auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + + CUDA_CHECK(cub::DeviceScan::InclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, + d_prefix_sum + 1, len, stream_)); + + cudaStreamSynchronize(stream_); + + if (step == 1) { + GraphFillFirstStepKernel<<>>( + d_prefix_sum, d_tmp_sampleidx2row, walk, d_start_ids, len, walk_degree_, + walk_len_, d_actual_sample_size, d_neighbors, d_sample_keys); + + } else { + GraphFillSampleKeysKernel<<>>( + d_neighbors, d_sample_keys, d_prefix_sum, d_sampleidx2row, + d_tmp_sampleidx2row, d_actual_sample_size, cur_degree, len); + + GraphDoWalkKernel<<>>( + d_neighbors, walk, d_prefix_sum, d_actual_sample_size, cur_degree, step, + len, len_per_row, d_tmp_sampleidx2row, walk_len_); + } + if (debug_mode_) { + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + int *h_prefix_sum = new int[len + 1]; + int *h_actual_size = new 
int[len]; + int *h_offset2idx = new int[once_max_sample_keynum]; + int64_t *h_sample_keys = new int64_t[once_max_sample_keynum]; + cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, + once_max_sample_keynum * sizeof(int), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_prefix_sum, d_prefix_sum, (len + 1) * sizeof(int), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < once_max_sample_keynum; xx++) { + VLOG(2) << "h_offset2idx[" << xx << "]: " << h_offset2idx[xx]; + } + for (int xx = 0; xx < len + 1; xx++) { + VLOG(2) << "h_prefix_sum[" << xx << "]: " << h_prefix_sum[xx]; + } + delete[] h_prefix_sum; + delete[] h_actual_size; + delete[] h_offset2idx; + delete[] h_sample_keys; + } + cudaStreamSynchronize(stream_); + cur_sampleidx2row_ = 1 - cur_sampleidx2row_; +} + +int GraphDataGenerator::FillFeatureBuf(std::shared_ptr d_walk, + std::shared_ptr d_feature) { + platform::CUDADeviceGuard guard(gpuid_); + + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + int ret = gpu_graph_ptr->get_feature_of_nodes(gpuid_, d_walk, d_feature, buf_size_, slot_num_); + return ret; +} + +int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { + platform::CUDADeviceGuard guard(gpuid_); + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + //////// + uint64_t *h_walk; + uint64_t *h_sample_keys; + int *h_offset2idx; + int *h_len_per_row; + uint64_t *h_prefix_sum; + if (debug_mode_) { + h_walk = new uint64_t[buf_size_]; + h_sample_keys = new uint64_t[once_max_sample_keynum]; + h_offset2idx = new int[once_max_sample_keynum]; + h_len_per_row = new int[once_max_sample_keynum]; + h_prefix_sum = new uint64_t[once_max_sample_keynum + 1]; + } + /////// + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + uint64_t *walk = reinterpret_cast(d_walk->ptr()); + int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); + uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); + cudaMemsetAsync(len_per_row, 0, once_max_sample_keynum * sizeof(int), + stream_); + int i = 0; + int total_row = 0; + size_t node_type_len = first_node_type_.size(); + int remain_size = + buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + + while (i <= remain_size) { + int cur_node_idx = cursor_ % node_type_len; + int node_type = first_node_type_[cur_node_idx]; + auto &path = meta_path_[cur_node_idx]; + size_t start = node_type_start_[node_type]; + // auto node_query_result = gpu_graph_ptr->query_node_list( + // gpuid_, node_type, start, once_sample_startid_len_); + + // int tmp_len = node_query_result.actual_sample_size; + VLOG(2) << "choose start type: " << node_type; + int type_index = type_to_index_[node_type]; + size_t device_key_size = h_device_keys_[type_index]->size(); + VLOG(2) << "type: " << node_type << " size: " << device_key_size + << " start: " << start; + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[type_index]->ptr()); + int tmp_len = start + once_sample_startid_len_ > device_key_size + ? 
device_key_size - start + : once_sample_startid_len_; + node_type_start_[node_type] = tmp_len + start; + if (tmp_len == 0) { + finish_node_type_.insert(node_type); + if (finish_node_type_.size() == node_type_start_.size()) { + break; + } + cursor_ += 1; + continue; + } + // if (tmp_len == 0) { + // break; + //} + VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ + << " tmp_len = " << tmp_len << " cursor = " << cursor_ + << " once_max_sample_keynum = " << once_max_sample_keynum; + uint64_t *cur_walk = walk + i; + + NeighborSampleQuery q; + q.initialize(gpuid_, path[0], (uint64_t)(d_type_keys + start), walk_degree_, + tmp_len); + auto sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); + + int step = 1; + VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; + jump_rows_ = sample_res.total_sample_size; + FillOneStep(d_type_keys + start, cur_walk, tmp_len, sample_res, + walk_degree_, step, len_per_row); + VLOG(2) << "jump_row: " << jump_rows_; + ///////// + if (debug_mode_) { + cudaMemcpy(h_walk, walk, buf_size_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < buf_size_; xx++) { + VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; + } + } + ///////// + step++; + size_t path_len = path.size(); + for (; step < walk_len_; step++) { + if (sample_res.total_sample_size == 0) { + break; + } + auto sample_key_mem = sample_res.actual_val_mem; + uint64_t *sample_keys_ptr = + reinterpret_cast(sample_key_mem->ptr()); + int edge_type_id = path[(step - 1) % path_len]; + VLOG(2) << "sample edge type: " << edge_type_id << " step: " << step; + q.initialize(gpuid_, edge_type_id, (uint64_t)sample_keys_ptr, 1, + sample_res.total_sample_size); + sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); + + FillOneStep(d_type_keys + start, cur_walk, sample_res.total_sample_size, + sample_res, 1, step, len_per_row); + if (debug_mode_) { + cudaMemcpy(h_walk, walk, buf_size_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < buf_size_; xx++) { + VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; + } + } + } + // cursor_ += tmp_len; + i += jump_rows_ * walk_len_; + total_row += jump_rows_; + cursor_ += 1; + } + buf_state_.Reset(total_row); + int *d_random_row = reinterpret_cast(d_random_row_->ptr()); + + thrust::random::default_random_engine engine(shuffle_seed_); + const auto &exec_policy = thrust::cuda::par.on(stream_); + thrust::counting_iterator cnt_iter(0); + thrust::shuffle_copy(exec_policy, cnt_iter, cnt_iter + total_row, + thrust::device_pointer_cast(d_random_row), engine); + + cudaStreamSynchronize(stream_); + shuffle_seed_ = engine(); + + if (debug_mode_) { + int *h_random_row = new int[total_row + 10]; + cudaMemcpy(h_random_row, d_random_row, total_row * sizeof(int), + cudaMemcpyDeviceToHost); + for (int xx = 0; xx < total_row; xx++) { + VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; + } + delete[] h_random_row; + delete[] h_walk; + delete[] h_sample_keys; + delete[] h_offset2idx; + delete[] h_len_per_row; + delete[] h_prefix_sum; + } + return total_row != 0; +} + +void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, + std::vector feed_vec) { + place_ = place; + gpuid_ = place_.GetDeviceId(); + VLOG(3) << "gpuid " << gpuid_; + stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + feed_vec_ = feed_vec; + slot_num_ = (feed_vec_.size() - 3) / 2; + + // d_device_keys_.resize(h_device_keys_.size()); + VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); + 
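+  // Copy each host-side key list (one per node type) into its own device
+  // buffer asynchronously on stream_, preserving the per-type order in
+  // d_device_keys_.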
+ for (size_t i = 0; i < h_device_keys_.size(); i++) { + for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + VLOG(3) << "h_device_keys_[" << i << "][" << j + << "] = " << (*(h_device_keys_[i]))[j]; + } + auto buf = memory::AllocShared( + place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + d_device_keys_.push_back(buf); + CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), h_device_keys_[i]->data(), + h_device_keys_[i]->size() * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream_)); + } + // h_device_keys_ = h_device_keys; + // device_key_size_ = h_device_keys_->size(); + // d_device_keys_ = + // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); + // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), + // device_key_size_ * sizeof(int64_t), + // cudaMemcpyHostToDevice, stream_)); + size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; + d_prefix_sum_ = + memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); + cudaMemsetAsync(d_prefix_sum_ptr, 0, + (once_max_sample_keynum + 1) * sizeof(int), stream_); + cursor_ = 0; + jump_rows_ = 0; + d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); + cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); + d_feature_ = memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); + cudaMemsetAsync(d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); + d_sample_keys_ = + memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + + d_sampleidx2rows_.push_back( + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_sampleidx2rows_.push_back( + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + cur_sampleidx2row_ = 0; + + d_len_per_row_ = + memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + for (int i = -window_; i < 0; i++) { + window_step_.push_back(i); + } + for (int i = 0; i < window_; i++) { + window_step_.push_back(i + 1); + } + buf_state_.Init(batch_size_, walk_len_, &window_step_); + d_random_row_ = memory::AllocShared( + place_, + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + shuffle_seed_ = 0; + + ins_buf_pair_len_ = 0; + d_ins_buf_ = + memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(int64_t)); + d_feature_buf_ = + memory::AllocShared(place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(int64_t)); + d_pair_num_ = memory::AllocShared(place_, sizeof(int)); + + cudaStreamSynchronize(stream_); +} + +void GraphDataGenerator::SetConfig( + const paddle::framework::DataFeedDesc &data_feed_desc) { + auto graph_config = data_feed_desc.graph_config(); + walk_degree_ = graph_config.walk_degree(); + walk_len_ = graph_config.walk_len(); + window_ = graph_config.window(); + once_sample_startid_len_ = graph_config.once_sample_startid_len(); + debug_mode_ = graph_config.debug_mode(); + if (debug_mode_) { + batch_size_ = graph_config.batch_size(); + } else { + batch_size_ = once_sample_startid_len_; + } + repeat_time_ = graph_config.sample_times_one_chunk(); + buf_size_ = + once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; + VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + << ", walk_len : " << walk_len_ << ", window : " << window_ + << ", once_sample_startid_len : " << once_sample_startid_len_ + << ", sample_times_one_chunk : " << repeat_time_ + << ", batch_size: " << batch_size_; + std::string first_node_type = 
graph_config.first_node_type(); + std::string meta_path = graph_config.meta_path(); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto edge_to_id = gpu_graph_ptr->edge_to_id; + auto node_to_id = gpu_graph_ptr->feature_to_id; + // parse first_node_type + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + finish_node_type_.clear(); + node_type_start_.clear(); + for (auto &type : node_types) { + auto iter = node_to_id.find(type); + PADDLE_ENFORCE_NE( + iter, node_to_id.end(), + platform::errors::NotFound("(%s) is not found in node_to_id.", type)); + VLOG(2) << "node_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + node_type_start_[iter->second] = 0; + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE( + iter, edge_to_id.end(), + platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } +}; + } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 6f7f1dac52804f..9c44de182e1587 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -23,6 +23,7 @@ limitations under the License. */ #include // NOLINT #include #include // NOLINT +#include #include #include #include // NOLINT @@ -56,6 +57,8 @@ namespace framework { class DataFeedDesc; class Scope; class Variable; +class NeighborSampleResult; +class NodeQueryResult; } // namespace framework } // namespace paddle @@ -774,6 +777,190 @@ class DLManager { std::map handle_map_; }; +struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9; + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)}; + engine.seed(sseq); + } +}; + +struct BufState { + int left; + int right; + int central_word; + int step; + engine_wrapper_t random_engine_; + + int len; + int cursor; + int row_num; + + int batch_size; + int walk_len; + std::vector* window; + + BufState() {} + ~BufState() {} + + void Init(int graph_batch_size, int graph_walk_len, + std::vector* graph_window) { + batch_size = graph_batch_size; + walk_len = graph_walk_len; + window = graph_window; + + left = 0; + right = window->size() - 1; + central_word = -1; + step = -1; + + len = 0; + cursor = 0; + row_num = 0; + for (size_t i = 0; i < graph_window->size(); i++) { + VLOG(2) << "graph_window[" << i << "] = " << (*graph_window)[i]; + } + } + + void Reset(int total_rows) { + cursor = 0; + row_num = total_rows; + int tmp_len = cursor + batch_size > row_num ? 
row_num - cursor : batch_size; + len = tmp_len; + central_word = -1; + step = -1; + GetNextCentrolWord(); + } + + int GetNextStep() { + step++; + if (step <= right && central_word + (*window)[step] < walk_len) { + return 1; + } + return 0; + } + + void Debug() { + VLOG(2) << "left: " << left << " right: " << right + << " central_word: " << central_word << " step: " << step + << " cursor: " << cursor << " len: " << len + << " row_num: " << row_num; + } + + int GetNextCentrolWord() { + if (++central_word >= walk_len) { + return 0; + } + int window_size = window->size() / 2; + int random_window = random_engine_.engine() % window_size + 1; + left = window_size - random_window; + right = window_size + random_window - 1; + VLOG(2) << "random window: " << random_window << " window[" << left + << "] = " << (*window)[left] << " window[" << right + << "] = " << (*window)[right]; + + for (step = left; step <= right; step++) { + if (central_word + (*window)[step] >= 0) { + return 1; + } + } + return 0; + } + + int GetNextBatch() { + cursor += len; + int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; + if (tmp_len == 0) { + return 0; + } + len = tmp_len; + central_word = -1; + step = -1; + GetNextCentrolWord(); + return tmp_len != 0; + } +}; + +class GraphDataGenerator { + public: + GraphDataGenerator(){}; + virtual ~GraphDataGenerator(){}; + void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); + void AllocResource(const paddle::platform::Place& place, + std::vector feed_vec); + int AcquireInstance(BufState* state); + int GenerateBatch(); + int FillWalkBuf(std::shared_ptr d_walk); + int FillFeatureBuf(std::shared_ptr d_walk, + std::shared_ptr d_feature); + void FillOneStep(uint64_t* start_ids, uint64_t* walk, int len, + NeighborSampleResult& sample_res, int cur_degree, int step, + int* len_per_row); + int FillInsBuf(); + void SetDeviceKeys(std::vector* device_keys, int type) { + type_to_index_[type] = h_device_keys_.size(); + h_device_keys_.push_back(device_keys); + } + + protected: + int walk_degree_; + int walk_len_; + int window_; + int once_sample_startid_len_; + int gpuid_; + // start ids + // int64_t* device_keys_; + // size_t device_key_size_; + std::vector*> h_device_keys_; + std::unordered_map type_to_index_; + // point to device_keys_ + size_t cursor_; + size_t jump_rows_; + int64_t* id_tensor_ptr_; + int64_t* show_tensor_ptr_; + int64_t* clk_tensor_ptr_; + cudaStream_t stream_; + paddle::platform::Place place_; + std::vector feed_vec_; + std::vector offset_; + std::shared_ptr d_prefix_sum_; + std::vector> d_device_keys_; + + std::shared_ptr d_walk_; + std::shared_ptr d_feature_; + std::shared_ptr d_len_per_row_; + std::shared_ptr d_random_row_; + // + std::vector> d_sampleidx2rows_; + int cur_sampleidx2row_; + // record the keys to call graph_neighbor_sample + std::shared_ptr d_sample_keys_; + int sample_keys_len_; + + std::set finish_node_type_; + std::unordered_map node_type_start_; + + std::shared_ptr d_ins_buf_; + std::shared_ptr d_feature_buf_; + std::shared_ptr d_pair_num_; + int ins_buf_pair_len_; + // size of a d_walk buf + size_t buf_size_; + int repeat_time_; + std::vector window_step_; + BufState buf_state_; + int batch_size_; + int slot_num_; + int shuffle_seed_; + int debug_mode_; + std::vector first_node_type_; + std::vector> meta_path_; +}; + class DataFeed { public: DataFeed() { @@ -836,6 +1023,12 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void 
SetCurrentPhase(int current_phase) {} + virtual void SetDeviceKeys(std::vector* device_keys, int type) { + gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { + gpu_graph_mode_ = gpu_graph_mode; + } virtual void SetFileListMutex(std::mutex* mutex) { mutex_for_pick_file_ = mutex; } @@ -919,6 +1112,8 @@ class DataFeed { // The input type of pipe reader, 0 for one sample, 1 for one batch int input_type_; + int gpu_graph_mode_ = 0; + GraphDataGenerator gpu_graph_data_generator_; }; // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 6964446f20946f..fe606630f92188 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -27,6 +27,18 @@ message MultiSlotDesc { optional string uid_slot = 2; } +message GraphConfig { + optional int32 walk_degree = 1 [ default = 1 ]; + optional int32 walk_len = 2 [ default = 20 ]; + optional int32 window = 3 [ default = 5 ]; + optional int32 once_sample_startid_len = 4 [ default = 8000 ]; + optional int32 sample_times_one_chunk = 5 [ default = 10 ]; + optional int32 batch_size = 6 [ default = 1 ]; + optional int32 debug_mode = 7 [ default = 0 ]; + optional string first_node_type = 8; + optional string meta_path = 9; +} + message DataFeedDesc { optional string name = 1; optional int32 batch_size = 2 [ default = 32 ]; @@ -37,4 +49,5 @@ message DataFeedDesc { optional int32 pv_batch_size = 7 [ default = 32 ]; optional int32 input_type = 8 [ default = 0 ]; optional string so_parser_name = 9; + optional GraphConfig graph_config = 10; } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index b4ae9949f2c6e6..beb0cc316da4b0 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -25,6 +25,7 @@ #ifdef PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #endif #if defined _WIN32 || defined __APPLE__ @@ -120,6 +121,24 @@ void DatasetImpl::SetDataFeedDesc(const std::string& data_feed_desc_str) { &data_feed_desc_); } +template +std::vector DatasetImpl::GetSlots() { + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + use_slots_.clear(); + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.type() == "uint64" || slot.type() == "uint32") { + use_slots_.push_back(slot.name()); + } + } + std::cout << "dataset use slots: "; + for (auto s : use_slots_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + return use_slots_; +} + template void DatasetImpl::SetChannelNum(int channel_num) { channel_num_ = channel_num; @@ -302,12 +321,11 @@ static int compute_thread_batch_nccl( thread_avg_batch_num = static_cast(offset.size() / thr_num); #ifdef PADDLE_WITH_GLOO auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); - if (!gloo_wrapper->IsInitialized()) { - VLOG(0) << "GLOO is not inited"; - gloo_wrapper->Init(); - } - if (gloo_wrapper->Size() > 1) { + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } // adjust batch num per thread for NCCL std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); std::vector total_instance_num_vec(1, total_instance_num); @@ -409,6 +427,18 @@ void MultiSlotDataset::PrepareTrain() { return; } +template +void 
DatasetImpl::SetGraphDeviceKeys( + const std::vector& h_device_keys) { + // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { + // gpu_graph_device_keys_[i].clear(); + // } + // size_t device_num = gpu_graph_device_keys_.size(); + // for (size_t i = 0; i < h_device_keys.size(); i++) { + // int shard = h_device_keys[i] % device_num; + // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); + // } +} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -417,12 +447,70 @@ void DatasetImpl::LoadIntoMemory() { platform::Timer timeline; timeline.Start(); std::vector load_threads; - for (int64_t i = 0; i < thread_num_; ++i) { - load_threads.push_back(std::thread( - &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); - } - for (std::thread& t : load_threads) { - t.join(); + if (gpu_graph_mode_) { + VLOG(0) << "in gpu_graph_mode"; + graph_all_type_total_keys_.clear(); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + graph_all_type_total_keys_.resize(node_to_id.size()); + int cnt = 0; + for (auto& iter : node_to_id) { + int node_idx = iter.second; + auto gpu_graph_device_keys = + gpu_graph_ptr->get_all_id(1, node_idx, thread_num_); + auto& type_total_key = graph_all_type_total_keys_[cnt]; + type_total_key.resize(thread_num_); + for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i + << "] = " << gpu_graph_device_keys[i].size(); + for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + type_total_key[i].push_back(gpu_graph_device_keys[i][j]); + } + } + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + cnt++; + } + //TODO(huwei02): open it when slot fea ready + //for (auto& iter : node_to_id) { + // int node_idx = iter.second; + // auto gpu_graph_device_keys = + // gpu_graph_ptr->get_all_feature_ids(1, node_idx, thread_num_); + // for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + // VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i + // << "] = " << gpu_graph_device_keys[i].size(); + // for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + // gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + // } + // } + //} + + // FIX: trick for iterate edge table + for (auto& iter : edge_to_id) { + int edge_idx = iter.second; + auto gpu_graph_device_keys = + gpu_graph_ptr->get_all_id(0, edge_idx, thread_num_); + for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { + VLOG(1) << "edge type: " << edge_idx << ", gpu_graph_device_keys[" << i + << "] = " << gpu_graph_device_keys[i].size(); + for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { + gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + } + } + } + + } else { + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } } input_channel_->Close(); int64_t in_chan_size = input_channel_->Size(); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 3f10cd7765bc1f..0d326d3fd1364a 100644 --- a/paddle/fluid/framework/data_set.h +++ 
b/paddle/fluid/framework/data_set.h @@ -152,12 +152,16 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // separate train thread and dataset thread + // seperate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num, bool discard_remaining_ins = false) = 0; virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; + virtual void SetGraphDeviceKeys( + const std::vector& h_device_keys) = 0; + + virtual std::vector GetSlots() = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -238,6 +242,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys); virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -246,6 +251,7 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); + virtual std::vector GetSlots(); /* for enable_heterps_ virtual void EnableHeterps(bool enable_heterps) { enable_heterps_ = enable_heterps; @@ -263,7 +269,9 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } - + std::vector& GetGpuGraphTotalKeys() { + return gpu_graph_total_keys_; + } Channel& GetInputChannelRef() { return input_channel_; } protected: @@ -321,7 +329,12 @@ class DatasetImpl : public Dataset { int64_t global_index_ = 0; std::vector> consume_task_pool_; std::vector input_records_; // only for paddleboxdatafeed + std::vector use_slots_; bool enable_heterps_ = false; + int gpu_graph_mode_ = 1; + // std::vector> gpu_graph_device_keys_; + std::vector>> graph_all_type_total_keys_; + std::vector gpu_graph_total_keys_; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 4fddfca5d805ac..37ec4666a30d67 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -69,7 +69,7 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, int node_num, int index) { #ifdef PADDLE_WITH_PSLIB if (!is_initialized_) { - VLOG(3) << "Going to init worker"; + VLOG(0) << "Going to init worker"; pslib_ptr_ = std::shared_ptr( new paddle::distributed::PSlib()); pslib_ptr_->init_worker(dist_desc, @@ -126,7 +126,7 @@ void FleetWrapper::GatherServers(const std::vector& host_sign_list, void FleetWrapper::GatherClients(const std::vector& host_sign_list) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to gather client ips"; + VLOG(0) << "Going to gather client ips"; size_t len = host_sign_list.size(); pslib_ptr_->gather_clients(const_cast(host_sign_list.data()), len); #endif @@ -142,7 +142,7 @@ std::vector FleetWrapper::GetClientsInfo() { void FleetWrapper::CreateClient2ClientConnection() { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to create client2client connection"; + VLOG(0) << "Going to create client2client connection"; pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_, client2client_connect_timeout_ms_, client2client_max_retry_); @@ -1054,7 +1054,8 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync( int slot_offset = 0; int grad_dim = 0; // don't worry, user do not have to care about all these 
flags - if (accesor == "DownpourCtrAccessor") { + if (accesor == "DownpourCtrAccessor" || + accesor == "DownpourCtrDymfAccessor") { dump_slot = true; slot_offset = 1; grad_dim = fea_dim - 2; diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3fdcf2379cb54a..823b60c5ef1f24 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -95,24 +95,6 @@ class HeterContext { } void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } uint32_t ShardNum() { return shard_num_; } - void init(int shard_num, int device_num) { - shard_num_ = shard_num; - feature_keys_.resize(shard_num_); - value_ptr_.resize(shard_num_); - device_task_ptr_.resize(shard_num_); - device_task_keys_.resize(shard_num_); - for (size_t i = 0; i < device_task_ptr_.size(); i++) { - device_task_ptr_[i].resize(device_num); - device_task_keys_[i].resize(device_num); - } - - device_values_.resize(device_num); - device_keys_.resize(device_num); - mutex_.resize(device_num); - for (size_t i = 0; i < mutex_.size(); ++i) { - mutex_[i] = new std::mutex(); - } - } void init(int shard_num, int device_num, int dim_num) { shard_num_ = shard_num; @@ -129,11 +111,6 @@ class HeterContext { for (size_t i = 0; i < feature_dim_keys_.size(); i++) { feature_dim_keys_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num); - if (i == 0) { - for (int j = 0; j < dim_num; j++) { - feature_dim_keys_[i][j].push_back(0); - } - } } device_values_.resize(device_num); device_dim_values_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index b633394e7a8117..cb7f3a40d6720b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -32,17 +32,33 @@ struct FeatureValue { float lr; float lr_g2sum; int mf_size; - float mf[MF_DIM + 1]; + int mf_dim; uint64_t cpu_ptr; + float mf[0]; friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) { out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot - << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:"; - for (int i = 0; i < val.mf_size; ++i) { + << " lr: " << val.lr << " mf_dim: " << val.mf_dim + << "cpuptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_dim + 1; ++i) { out << " " << val.mf[i]; } return out; } + __device__ __forceinline__ void operator=(const FeatureValue& in) { + delta_score = in.delta_score; + show = in.show; + clk = in.clk; + slot = in.slot; + lr = in.lr; + lr_g2sum = in.lr_g2sum; + mf_size = in.mf_size; + mf_dim = in.mf_dim; + cpu_ptr = in.cpu_ptr; + for (int i = 0; i < mf_dim + 1; i++) { + mf[i] = in.mf[i]; + } + } }; struct FeaturePushValue { @@ -50,20 +66,33 @@ struct FeaturePushValue { float clk; int slot; float lr_g; - float mf_g[MF_DIM]; + int mf_dim; + float mf_g[0]; - // __device__ __forceinline__ FeaturePushValue - // operator+(const FeaturePushValue& a) const { - // FeaturePushValue out; - // out.slot = a.slot; - // out.show = a.show + show; - // out.clk = a.clk + clk; - // out.lr_g = a.lr_g + lr_g; - // for (int i = 0; i < MF_DIM; ++i) { - // out.mf_g[i] = a.mf_g[i] + mf_g[i]; - // } - // return out; - // } + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = 
a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } + __device__ __forceinline__ void operator=(const FeaturePushValue& in) { + show = in.show; + clk = in.clk; + slot = in.slot; + lr_g = in.lr_g; + mf_dim = in.mf_dim; + for (int i = 0; i < mf_dim; i++) { + mf_g[i] = in.mf_g[i]; + } + } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 19c355c671a386..dcdca8944b1424 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -20,23 +20,24 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/phi/core/enforce.h" namespace paddle { namespace framework { struct GpuPsGraphNode { - int64_t node_id; + uint64_t node_id; int64_t neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; struct GpuPsCommGraph { - int64_t *neighbor_list; + uint64_t *neighbor_list; GpuPsGraphNode *node_list; int64_t neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + GpuPsCommGraph(uint64_t *neighbor_list_, GpuPsGraphNode *node_list_, int64_t neighbor_size_, int64_t node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), @@ -45,7 +46,7 @@ struct GpuPsCommGraph { void init_on_cpu(int64_t neighbor_size, int64_t node_size) { this->neighbor_size = neighbor_size; this->node_size = node_size; - this->neighbor_list = new int64_t[neighbor_size]; + this->neighbor_list = new uint64_t[neighbor_size]; this->node_list = new paddle::framework::GpuPsGraphNode[node_size]; } void release_on_cpu() { @@ -55,15 +56,15 @@ struct GpuPsCommGraph { void display_on_cpu() { VLOG(0) << "neighbor_size = " << neighbor_size; VLOG(0) << "node_size = " << node_size; - for (size_t i = 0; i < neighbor_size; i++) { + for (int64_t i = 0; i < neighbor_size; i++) { VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; } - for (size_t i = 0; i < node_size; i++) { + for (int64_t i = 0; i < node_size; i++) { VLOG(0) << "node i " << node_list[i].node_id << " neighbor_size = " << node_list[i].neighbor_size; std::string str; int offset = node_list[i].neighbor_offset; - for (size_t j = 0; j < node_list[i].neighbor_size; j++) { + for (int64_t j = 0; j < node_list[i].neighbor_size; j++) { if (j > 0) str += ","; str += std::to_string(neighbor_list[j + offset]); } @@ -123,21 +124,25 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 */ struct NeighborSampleQuery { int gpu_id; - int64_t *key; - int sample_size; + int table_idx; + uint64_t *src_nodes; int len; - void initialize(int gpu_id, int64_t key, int sample_size, int len) { + int sample_size; + void initialize(int gpu_id, int table_idx, uint64_t src_nodes, + int sample_size, int len) { + this->table_idx = table_idx; this->gpu_id = gpu_id; - this->key = (int64_t *)key; + this->src_nodes = (uint64_t *)src_nodes; this->sample_size = sample_size; this->len = len; } void display() { - int64_t *sample_keys = new int64_t[len]; + uint64_t *sample_keys = new uint64_t[len]; VLOG(0) << "device_id " << gpu_id << " sample_size = " << 
sample_size; - VLOG(0) << "there are " << len << " keys "; + VLOG(0) << "there are " << len << " keys to sample for graph " << table_idx; std::string key_str; - cudaMemcpy(sample_keys, key, len * sizeof(int64_t), cudaMemcpyDeviceToHost); + cudaMemcpy(sample_keys, src_nodes, len * sizeof(uint64_t), + cudaMemcpyDeviceToHost); for (int i = 0; i < len; i++) { if (key_str.size() > 0) key_str += ";"; @@ -148,14 +153,14 @@ struct NeighborSampleQuery { } }; struct NeighborSampleResult { - int64_t *val; - int64_t *actual_val; + uint64_t *val; + uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; - int64_t *get_val() { return val; } - int64_t get_actual_val() { return (int64_t)actual_val; } + uint64_t *get_val() { return val; } + uint64_t get_actual_val() { return (uint64_t)actual_val; } int *get_actual_sample_size() { return actual_sample_size; } int get_sample_size() { return sample_size; } int get_key_size() { return key_size; } @@ -167,16 +172,16 @@ struct NeighborSampleResult { platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(int64_t)); - val = (int64_t *)val_mem->ptr(); + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + val = (uint64_t *)val_mem->ptr(); actual_sample_size_mem = memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } void display() { VLOG(0) << "in node sample result display ------------------"; - int64_t *res = new int64_t[sample_size * key_size]; - cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), + uint64_t *res = new uint64_t[sample_size * key_size]; + cudaMemcpy(res, val, sample_size * key_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); int *ac_size = new int[key_size]; cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), @@ -185,8 +190,8 @@ struct NeighborSampleResult { for (int i = 0; i < key_size; i++) { total_sample_size += ac_size[i]; } - int64_t *res2 = new int64_t[total_sample_size]; // r - cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + uint64_t *res2 = new uint64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); // r int start = 0; @@ -208,13 +213,13 @@ struct NeighborSampleResult { delete[] ac_size; VLOG(0) << " ------------------"; } - std::vector get_sampled_graph(NeighborSampleQuery q) { - std::vector graph; + std::vector get_sampled_graph(NeighborSampleQuery q) { + std::vector graph; int64_t *sample_keys = new int64_t[q.len]; std::string key_str; - cudaMemcpy(sample_keys, q.key, q.len * sizeof(int64_t), + cudaMemcpy(sample_keys, q.src_nodes, q.len * sizeof(uint64_t), cudaMemcpyDeviceToHost); - int64_t *res = new int64_t[sample_size * key_size]; + uint64_t *res = new uint64_t[sample_size * key_size]; cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), cudaMemcpyDeviceToHost); int *ac_size = new int[key_size]; @@ -224,8 +229,8 @@ struct NeighborSampleResult { for (int i = 0; i < key_size; i++) { total_sample_size += ac_size[i]; } - int64_t *res2 = new int64_t[total_sample_size]; // r - cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + uint64_t *res2 = new uint64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); // r int start 
= 0; @@ -248,24 +253,24 @@ struct NeighborSampleResult { }; struct NodeQueryResult { - int64_t *val; + uint64_t *val; int actual_sample_size; - int64_t get_val() { return (int64_t)val; } + uint64_t get_val() { return (uint64_t)val; } int get_len() { return actual_sample_size; } std::shared_ptr val_mem; void initialize(int query_size, int dev_id) { platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = memory::AllocShared(place, query_size * sizeof(int64_t)); - val = (int64_t *)val_mem->ptr(); + val_mem = memory::AllocShared(place, query_size * sizeof(uint64_t)); + val = (uint64_t *)val_mem->ptr(); // cudaMalloc((void **)&val, query_size * sizeof(int64_t)); actual_sample_size = 0; } void display() { VLOG(0) << "in node query result display ------------------"; - int64_t *res = new int64_t[actual_sample_size]; - cudaMemcpy(res, val, actual_sample_size * sizeof(int64_t), + uint64_t *res = new uint64_t[actual_sample_size]; + cudaMemcpy(res, val, actual_sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); VLOG(0) << "actual_sample_size =" << actual_sample_size; @@ -283,7 +288,71 @@ struct NodeQueryResult { actual_sample_size = 0; }; ~NodeQueryResult() {} +}; // end of struct NodeQueryResult + +struct GpuPsGraphFeaNode { + uint64_t node_id; + uint64_t feature_size, feature_offset; + // this node's feature is stored on [feature_offset,feature_offset + + // feature_size) of int64_t *feature_list; }; -} -}; + +struct GpuPsCommGraphFea { + uint64_t *feature_list; + uint8_t *slot_id_list; + GpuPsGraphFeaNode *node_list; + uint64_t feature_size, node_size; + // the size of feature array and graph_node_list array + GpuPsCommGraphFea() + : feature_list(NULL), + slot_id_list(NULL), + node_list(NULL), + feature_size(0), + node_size(0) {} + GpuPsCommGraphFea(uint64_t *feature_list_, uint8_t *slot_id_list_, + GpuPsGraphFeaNode *node_list_, uint64_t feature_size_, + uint64_t node_size_) + : feature_list(feature_list_), + slot_id_list(slot_id_list_), + node_list(node_list_), + feature_size(feature_size_), + node_size(node_size_) {} + void init_on_cpu(uint64_t feature_size, uint64_t node_size, + uint32_t slot_num) { + PADDLE_ENFORCE_LE(slot_num, 255); + this->feature_size = feature_size; + this->node_size = node_size; + this->feature_list = new uint64_t[feature_size]; + this->slot_id_list = new uint8_t[feature_size]; + this->node_list = new GpuPsGraphFeaNode[node_size]; + } + void release_on_cpu() { + delete[] feature_list; + delete[] slot_id_list; + delete[] node_list; + } + void display_on_cpu() { + VLOG(1) << "feature_size = " << feature_size; + VLOG(1) << "node_size = " << node_size; + for (uint64_t i = 0; i < feature_size; i++) { + VLOG(1) << "feature_list[" << i << "] = " << feature_list[i]; + } + for (uint64_t i = 0; i < node_size; i++) { + VLOG(1) << "node_id[" << node_list[i].node_id + << "] feature_size = " << node_list[i].feature_size; + std::string str; + int offset = node_list[i].feature_offset; + for (uint64_t j = 0; j < node_list[i].feature_size; j++) { + if (j > 0) str += ","; + str += std::to_string(slot_id_list[j + offset]); + str += ":"; + str += std::to_string(feature_list[j + offset]); + } + VLOG(1) << str; + } + } +}; // end of struct GpuPsCommGraphFea + +} // end of namespace framework +} // end of namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h new file mode 100644 index 00000000000000..e63043e414bbe9 --- /dev/null +++ 
b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace paddle { +namespace framework { + +#include +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +inline void debug_gpu_memory_info(const char* desc) { + int device_num = 0; + auto err = cudaGetDeviceCount(&device_num); + PADDLE_ENFORCE_EQ(err, cudaSuccess, + platform::errors::InvalidArgument("cudaGetDeviceCount failed!")); + + size_t avail{0}; + size_t total{0}; + for (int i = 0; i < device_num; ++i) { + cudaSetDevice(i); + auto err = cudaMemGetInfo(&avail, &total); + PADDLE_ENFORCE_EQ(err, cudaSuccess, + platform::errors::InvalidArgument("cudaMemGetInfo failed!")); + VLOG(0) << "update gpu memory on device " << i << ", " + << "avail=" << avail/1024.0/1024.0/1024.0 << "g, " + << "total=" << total/1024.0/1024.0/1024.0 << "g, " + << "use_rate=" << (total-avail)/double(total) << "%, " + << "desc=" << desc; + } +} + +}; // namespace framework +}; // namespace paddle + diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index ae57c2ebe932f8..c4231cb7beb8b0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,19 +23,38 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { +enum GraphTableType { EDGE_TABLE, FEATURE_TABLE }; class GpuPsGraphTable : public HeterComm { public: - GpuPsGraphTable(std::shared_ptr resource, int topo_aware) + int get_table_offset(int gpu_id, GraphTableType type, int idx) const { + int type_id = type; + return gpu_id * (graph_table_num_ + feature_table_num_) + + type_id * graph_table_num_ + idx; + } + GpuPsGraphTable(std::shared_ptr resource, int topo_aware, + int graph_table_num) : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); + this->graph_table_num_ = graph_table_num; + this->feature_table_num_ = 1; gpu_num = resource_->total_device(); memset(global_device_map, -1, sizeof(global_device_map)); + for (auto &table : tables_) { + delete table; + table = NULL; + } + int feature_table_num = 1; + tables_ = std::vector( + gpu_num * (graph_table_num + feature_table_num), NULL); for (int i = 0; i < gpu_num; i++) { - gpu_graph_list.push_back(GpuPsCommGraph()); global_device_map[resource_->dev_id(i)] = i; - sample_status.push_back(NULL); - tables_.push_back(NULL); + for (int j = 0; j < graph_table_num; j++) { + gpu_graph_list_.push_back(GpuPsCommGraph()); + } + for (int j = 0; j < feature_table_num; j++) { + gpu_graph_fea_list_.push_back(GpuPsCommGraphFea()); + } } cpu_table_status = -1; if (topo_aware) { @@ -89,35 +108,46 @@ class GpuPsGraphTable : public HeterComm { // end_graph_sampling(); // } } - void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id); - void clear_graph_info(int gpu_id); - void 
build_graph_from_cpu(std::vector &cpu_node_list); + void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id, int idx); + void build_graph_fea_on_single_gpu(GpuPsCommGraphFea &g, int gpu_id); + void clear_graph_info(int gpu_id, int index); + void clear_graph_info(int index); + void clear_feature_info(int gpu_id, int index); + void clear_feature_info(int index); + void build_graph_from_cpu(std::vector &cpu_node_list, + int idx); + void build_graph_fea_from_cpu(std::vector &cpu_node_list, + int idx); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); - NeighborSampleResult graph_neighbor_sample(int gpu_id, int64_t *key, + NeighborSampleResult graph_neighbor_sample(int gpu_id, uint64_t *key, int sample_size, int len); - NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key, - int sample_size, int len, - bool cpu_query_switch); - void init_sample_status(); - void free_sample_status(); - NodeQueryResult query_node_list(int gpu_id, int start, int query_size); - void clear_graph_info(); + NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int idx, + uint64_t *key, int sample_size, + int len, bool cpu_query_switch); + + int get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, int size, int slot_num); + + NodeQueryResult query_node_list(int gpu_id, int idx, int start, + int query_size); void display_sample_res(void *key, void *val, int len, int sample_len); - void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + void move_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, int *h_left, int *h_right, - int64_t *src_sample_res, + uint64_t *src_sample_res, int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int gpu_num; - std::vector gpu_graph_list; + int graph_table_num_, feature_table_num_; + std::vector gpu_graph_list_; + std::vector gpu_graph_fea_list_; int global_device_map[32]; - std::vector sample_status; const int parallel_sample_size = 1; const int dim_y = 256; - std::shared_ptr cpu_graph_table; + std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; std::condition_variable cv_; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 72b9cae41c0fdf..f423a33abe3499 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -32,8 +32,8 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ -__global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, - int64_t* cpu_key, int* sum, int* index, +__global__ void get_cpu_id_index(uint64_t* key, int* actual_sample_size, + uint64_t* cpu_key, int* sum, int* index, int len) { CUDA_KERNEL_LOOP(i, len) { if (actual_sample_size[i] == -1) { @@ -46,12 +46,12 @@ __global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, } __global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) { - CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); } + CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(uint64_t); } } template __global__ void copy_buffer_ac_to_final_place( - int64_t* gpu_buffer, int* gpu_ac, int64_t* val, int* actual_sample_size, + uint64_t* gpu_buffer, int* gpu_ac, uint64_t* val, int* actual_sample_size, int* index, int* 
cumsum_gpu_ac, int number_on_cpu, int sample_size) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -68,12 +68,43 @@ __global__ void copy_buffer_ac_to_final_place( } } +__global__ void get_features_kernel(GpuPsCommGraphFea graph, int64_t* node_offset_array, + int* actual_size, uint64_t* feature, int slot_num, int n) { + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < n) { + int node_offset = node_offset_array[idx]; + int offset = idx * slot_num; + if (node_offset == -1) { + for (int k = 0; k < slot_num; ++ k) { + feature[offset + k] = 0; + } + actual_size[idx] = slot_num; + return; + } + + GpuPsGraphFeaNode* node = &(graph.node_list[node_offset]); + uint64_t* feature_start = &(graph.feature_list[node->feature_offset]); + uint8_t* slot_id_start = &(graph.slot_id_list[node->feature_offset]); + int m = 0; + for (int k = 0; k < slot_num; ++k) { + if (m >= node->feature_size || k < slot_id_start[m]) { + feature[offset + k] = 0; + } else if (k == slot_id_start[m]) { + feature[offset + k] = feature_start[m]; + ++m; + } else { + assert(0); + } + } + actual_size[idx] = slot_num; + } +} + template -__global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - int64_t* node_index, - int* actual_size, int64_t* res, - int sample_len, int n, - int default_value) { +__global__ void neighbor_sample_kernel(GpuPsCommGraph graph, + int64_t* node_index, int* actual_size, + uint64_t* res, int sample_len, int n, + int default_value) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -91,7 +122,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size; int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset; int offset = i * sample_len; - int64_t* data = graph.neighbor_list; + uint64_t* data = graph.neighbor_list; if (neighbor_len <= sample_len) { for (int j = threadIdx.x; j < neighbor_len; j += WARP_SIZE) { res[offset + j] = data[data_offset + j]; @@ -120,85 +151,10 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } } -__global__ void neighbor_sample_example(GpuPsCommGraph graph, - int64_t* node_index, int* actual_size, - int64_t* res, int sample_len, - int* sample_status, int n, int from) { - int id = blockIdx.x * blockDim.y + threadIdx.y; - if (id < n) { - if (node_index[id] == -1) { - actual_size[id] = 0; - return; - } - curandState rng; - curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); - int64_t index = threadIdx.x; - int64_t offset = id * sample_len; - int64_t* data = graph.neighbor_list; - int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset; - int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size; - int ac_len; - if (sample_len > neighbor_len) - ac_len = neighbor_len; - else { - ac_len = sample_len; - } - if (4 * ac_len >= 3 * neighbor_len) { - if (index == 0) { - res[offset] = curand(&rng) % (neighbor_len - ac_len + 1); - } - __syncwarp(); - int start = res[offset]; - while (index < ac_len) { - res[offset + index] = data[data_offset + start + index]; - index += blockDim.x; - } - actual_size[id] = ac_len; - } else { - while (index < ac_len) { - int num = curand(&rng) % neighbor_len; - int* addr = sample_status + data_offset + num; - int expected = *addr; - if (!(expected & (1 << from))) { - int old = atomicCAS(addr, expected, expected | (1 << from)); - if (old == expected) { - res[offset + index] = num; - index += blockDim.x; - } - } - } - __syncwarp(); - index = threadIdx.x; - while (index 
< ac_len) { - int* addr = sample_status + data_offset + res[offset + index]; - int expected, old = *addr; - do { - expected = old; - old = atomicCAS(addr, expected, expected & (~(1 << from))); - } while (old != expected); - res[offset + index] = data[data_offset + res[offset + index]]; - index += blockDim.x; - } - actual_size[id] = ac_len; - } - } - // const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - // if (i < n) { - // auto node_index = index[i]; - // actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size - // ? graph.node_list[node_index].neighbor_size - // : sample_size; - // int offset = graph.node_list[node_index].neighbor_offset; - // for (int j = 0; j < actual_size[i]; j++) { - // sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; - // } - // } -} - int GpuPsGraphTable::init_cpu_table( const paddle::distributed::GraphParameter& graph) { - cpu_graph_table.reset(new paddle::distributed::GraphTable); - cpu_table_status = cpu_graph_table->Initialize(graph); + cpu_graph_table_.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table_->Initialize(graph); // if (cpu_table_status != 0) return cpu_table_status; // std::function&)> callback = // [this](std::vector& res) { @@ -212,17 +168,6 @@ int GpuPsGraphTable::init_cpu_table( return cpu_table_status; } -// int GpuPsGraphTable::load(const std::string& path, const std::string& param) -// { -// int status = cpu_graph_table->load(path, param); -// if (status != 0) { -// return status; -// } -// std::unique_lock lock(mutex_); -// cpu_graph_table->start_graph_sampling(); -// cv_.wait(lock); -// return 0; -// } /* comment 1 gpu i triggers a neighbor_sample task, @@ -246,30 +191,32 @@ int GpuPsGraphTable::init_cpu_table( void GpuPsGraphTable::display_sample_res(void* key, void* val, int len, int sample_len) { - char key_buffer[len * sizeof(int64_t)]; + char key_buffer[len * sizeof(uint64_t)]; char val_buffer[sample_len * sizeof(int64_t) * len + - (len + len % 2) * sizeof(int) + len * sizeof(int64_t)]; - cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost); + (len + len % 2) * sizeof(int) + len * sizeof(uint64_t)]; + cudaMemcpy(key_buffer, key, sizeof(uint64_t) * len, cudaMemcpyDeviceToHost); cudaMemcpy(val_buffer, val, sample_len * sizeof(int64_t) * len + - (len + len % 2) * sizeof(int) + len * sizeof(int64_t), + (len + len % 2) * sizeof(int) + len * sizeof(uint64_t), cudaMemcpyDeviceToHost); - int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) + - len * sizeof(int64_t)); + uint64_t* sample_val = + (uint64_t*)(val_buffer + (len + len % 2) * sizeof(int) + + len * sizeof(int64_t)); for (int i = 0; i < len; i++) { - printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t))); - printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t))); + printf("key %llu\n", *(int64_t*)(key_buffer + i * sizeof(uint64_t))); + printf("index %llu\n", *(int64_t*)(val_buffer + i * sizeof(uint64_t))); int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t)); printf("sampled %d neigbhors\n", ac_size); for (int j = 0; j < ac_size; j++) { - printf("%lld ", sample_val[i * sample_len + j]); + printf("%llu ", sample_val[i * sample_len + j]); } printf("\n"); } } -void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + +void GpuPsGraphTable::move_result_to_source_gpu( int start_index, int gpu_num, int sample_size, int* h_left, int* h_right, - int64_t* src_sample_res, int* actual_sample_size) { + uint64_t* 
src_sample_res, int* actual_sample_size) { int shard_len[gpu_num]; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -289,7 +236,7 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( reinterpret_cast(src_sample_res + h_left[i] * sample_size), node.val_storage + sizeof(int64_t) * shard_len[i] + sizeof(int) * (shard_len[i] + shard_len[i] % 2), - sizeof(int64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, + sizeof(uint64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, node.out_stream); cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), node.val_storage + sizeof(int64_t) * shard_len[i], @@ -304,115 +251,13 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( cudaStreamSynchronize(node.out_stream); // cudaStreamSynchronize(resource_->remote_stream(i, start_index)); } - /* - std::queue que; - // auto& node = path_[gpu_id][i].nodes_.front(); - // cudaMemcpyAsync( - // reinterpret_cast(src_sample_res + h_left[i] * sample_size), - // node.val_storage + sizeof(int64_t) * shard_len, - // node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, - // node.out_stream); - // cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - // node.val_storage + sizeof(int) * shard_len, - // sizeof(int) * shard_len, cudaMemcpyDefault, - // node.out_stream); - int cur_step = path_[start_index][i].nodes_.size() - 1; - auto& node = path_[start_index][i].nodes_[cur_step]; - if (cur_step == 0) { - // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), - // node.val_storage, node.val_bytes_len, - // cudaMemcpyDefault, - // node.out_stream); - // VLOG(0)<<"copy "<(src_sample_res + h_left[i] * sample_size), - node.val_storage + sizeof(int64_t) * shard_len[i], - node.val_bytes_len - sizeof(int64_t) * shard_len[i], - cudaMemcpyDefault, - node.out_stream); - //resource_->remote_stream(i, start_index)); - cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len[i], - sizeof(int) * shard_len[i], cudaMemcpyDefault, - node.out_stream); - //resource_->remote_stream(i, start_index)); - } else { - CopyTask t(&path_[start_index][i], cur_step - 1); - que.push(t); - // VLOG(0)<<"copy "<remote_stream(i, start_index)); - } - } - while (!que.empty()) { - CopyTask& cur_task = que.front(); - que.pop(); - int cur_step = cur_task.step; - if (cur_task.path->nodes_[cur_step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); - //cudaStreamSynchronize(resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } - if (cur_step > 0) { - CopyTask c(cur_task.path, cur_step - 1); - que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } else if (cur_step == 0) { - int end_index = cur_task.path->nodes_.back().gpu_num; - // cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), - // cur_task.path->nodes_[cur_step].val_storage, - // cur_task.path->nodes_[cur_step].val_bytes_len, - // cudaMemcpyDefault, - // cur_task.path->nodes_[cur_step].out_stream); - //VLOG(0)<<"copy "<nodes_[cur_step].gpu_num<< " to - "<(src_sample_res + - h_left[end_index] * sample_size), - cur_task.path->nodes_[cur_step].val_storage + - sizeof(int64_t) * shard_len[end_index], - 
cur_task.path->nodes_[cur_step].val_bytes_len - - sizeof(int64_t) * shard_len[end_index], - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - cudaMemcpyAsync( - reinterpret_cast(actual_sample_size + h_left[end_index]), - cur_task.path->nodes_[cur_step].val_storage + - sizeof(int) * shard_len[end_index], - sizeof(int) * shard_len[end_index], cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); - //resource_->remote_stream(cur_task.path->nodes_.back().gpu_num, - start_index)); - } - } - for (int i = 0; i < gpu_num; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - auto& node = path_[start_index][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); - //cudaStreamSynchronize(resource_->remote_stream(i, start_index)); - } - */ } /* TODO: how to optimize it to eliminate the for loop */ -__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, +__global__ void fill_dvalues(uint64_t* d_shard_vals, uint64_t* d_vals, int* d_shard_actual_sample_size, int* d_actual_sample_size, int* idx, int sample_size, int len) { @@ -425,7 +270,18 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, } } -__global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, +__global__ void fill_dvalues(uint64_t* d_shard_vals, uint64_t* d_vals, + int* d_shard_actual_sample_size, + int* idx, int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } + } +} + +__global__ void fill_actual_vals(uint64_t* vals, uint64_t* actual_vals, int* actual_sample_size, int* cumsum_actual_sample_size, int sample_size, int len) { @@ -438,18 +294,48 @@ __global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, } __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, - int64_t* res) { + uint64_t* res) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < size) { res[i] = graph.node_list[start + i].node_id; } } -void GpuPsGraphTable::clear_graph_info(int gpu_id) { - if (tables_.size() && tables_[gpu_id] != NULL) { - delete tables_[gpu_id]; +void GpuPsGraphTable::clear_feature_info(int gpu_id) { + int idx = 0; + if (idx >= feature_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = NULL; + } + + int graph_fea_idx = gpu_id * feature_table_num_ + idx; + if (graph_fea_idx >= gpu_graph_fea_list_.size()) { + return; + } + auto& graph = gpu_graph_fea_list_[graph_fea_idx]; + if (graph.feature_list != NULL) { + cudaFree(graph.feature_list); + } + + if (graph.slot_id_list != NULL) { + cudaFree(graph.slot_id_list); + } + + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } +} + +void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) { + if (idx >= graph_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::EDGE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = NULL; } - auto& graph = gpu_graph_list[gpu_id]; + auto& graph = gpu_graph_list_[gpu_id * graph_table_num_ + idx]; if (graph.neighbor_list != NULL) { cudaFree(graph.neighbor_list); } @@ -457,21 +343,88 @@ void GpuPsGraphTable::clear_graph_info(int gpu_id) { cudaFree(graph.node_list); } } -void 
GpuPsGraphTable::clear_graph_info() { - if (tables_.size()) { - for (auto table : tables_) delete table; - } - tables_.clear(); - for (auto graph : gpu_graph_list) { - if (graph.neighbor_list != NULL) { - cudaFree(graph.neighbor_list); - } - if (graph.node_list != NULL) { - cudaFree(graph.node_list); +void GpuPsGraphTable::clear_graph_info(int idx) { + for (int i = 0; i < gpu_num; i++) clear_graph_info(i, idx); +} +/* +the parameter std::vector cpu_graph_list is generated by cpu. +it saves the graph to be saved on each gpu. +for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number +== i +In this function, memory is allocated on each gpu to save the graphs, +gpu i saves the ith graph from cpu_graph_list +*/ +void GpuPsGraphTable::build_graph_fea_on_single_gpu(GpuPsCommGraphFea& g, + int gpu_id) { + clear_feature_info(gpu_id); + int ntype_id = 0; + + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + + int offset = gpu_id * feature_table_num_ + ntype_id; + gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); + + int table_offset = + get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); + + size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + tables_[table_offset] = new Table(capacity); + if (g.node_size > 0) { + std::vector keys; + std::vector offsets; + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].node_list, + g.node_size * sizeof(GpuPsGraphFeaNode)); + cudaMemcpy(gpu_graph_fea_list_[offset].node_list, g.node_list, + g.node_size * sizeof(GpuPsGraphFeaNode), cudaMemcpyHostToDevice); + for (int64_t j = 0; j < g.node_size; j++) { + keys.push_back(g.node_list[j].node_id); + offsets.push_back(j); } + build_ps(gpu_id, keys.data(), offsets.data(), keys.size(), 1024, 8, + table_offset); + gpu_graph_fea_list_[offset].node_size = g.node_size; + } else { + build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_fea_list_[offset].node_list = NULL; + gpu_graph_fea_list_[offset].node_size = 0; + } + if (g.feature_size) { + // TODO + cudaError_t cudaStatus = + cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, + g.feature_size * sizeof(uint64_t)); + PADDLE_ENFORCE_EQ( + cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "ailed to allocate memory for graph-feature on gpu ")); + VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t) + << " bytes of memory for graph-feature on gpu " + << resource_->dev_id(gpu_id); + cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, g.feature_list, + g.feature_size * sizeof(uint64_t), cudaMemcpyHostToDevice); + + // TODO + cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, + g.feature_size * sizeof(uint8_t)); + PADDLE_ENFORCE_EQ( + cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "ailed to allocate memory for graph-feature on gpu ")); + VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t) + << " bytes of memory for graph-feature on gpu " + << resource_->dev_id(gpu_id); + cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, g.slot_id_list, + g.feature_size * sizeof(uint8_t), cudaMemcpyHostToDevice); + + gpu_graph_fea_list_[offset].feature_size = g.feature_size; + } else { + gpu_graph_fea_list_[offset].feature_list = NULL; + gpu_graph_fea_list_[offset].slot_id_list = NULL; + gpu_graph_fea_list_[offset].feature_size = 0; } - gpu_graph_list.clear(); } + /* the parameter std::vector cpu_graph_list is generated by cpu. it saves the graph to be saved on each gpu. 
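To make the new indexing easier to follow before the next hunk: the refactor drops the old one-table-per-GPU `tables_` / `gpu_graph_list` vectors in favour of a flat `tables_` array that interleaves edge tables and feature tables per GPU, addressed through `get_table_offset(gpu_id, type, idx)` as declared in graph_gpu_ps_table.h above. Below is a minimal, self-contained sketch of that arithmetic only; the concrete counts (2 GPUs, 3 edge tables, 1 feature table) are made-up example values and none of this code is part of the patch.

// Sketch of the flat table layout implied by get_table_offset().
// Per GPU the block is [edge_0 .. edge_{E-1} | fea_0 .. fea_{F-1}].
#include <cstdio>

enum GraphTableType { EDGE_TABLE, FEATURE_TABLE };

int get_table_offset(int gpu_id, GraphTableType type, int idx,
                     int graph_table_num, int feature_table_num) {
  int type_id = type;  // 0 for edge tables, 1 for feature tables
  return gpu_id * (graph_table_num + feature_table_num) +
         type_id * graph_table_num + idx;
}

int main() {
  const int E = 3, F = 1;  // example graph_table_num_ / feature_table_num_
  for (int gpu = 0; gpu < 2; ++gpu) {
    for (int e = 0; e < E; ++e)
      printf("gpu %d edge table %d -> tables_[%d]\n", gpu, e,
             get_table_offset(gpu, EDGE_TABLE, e, E, F));
    printf("gpu %d feature table 0 -> tables_[%d]\n", gpu,
           get_table_offset(gpu, FEATURE_TABLE, 0, E, F));
  }
  return 0;
}

Note that `gpu_graph_list_` and `gpu_graph_fea_list_` keep their own simpler offsets (`gpu_id * graph_table_num_ + idx` and `gpu_id * feature_table_num_ + idx`); only the hash tables live in the interleaved `tables_` layout above.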
@@ -480,118 +433,170 @@ for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ - -void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { - clear_graph_info(i); +void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i, + int idx) { + clear_graph_info(i, idx); platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); - gpu_graph_list[i] = GpuPsCommGraph(); - sample_status[i] = NULL; - tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_); + int offset = i * graph_table_num_ + idx; + gpu_graph_list_[offset] = GpuPsCommGraph(); + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + size_t capacity = std::max((uint64_t)1, (uint64_t)g.node_size) / load_factor_; + tables_[table_offset] = new Table(capacity); if (g.node_size > 0) { - std::vector keys; - std::vector offset; - cudaMalloc((void**)&gpu_graph_list[i].node_list, + std::vector keys; + std::vector offsets; + cudaMalloc((void**)&gpu_graph_list_[offset].node_list, g.node_size * sizeof(GpuPsGraphNode)); - cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, + cudaMemcpy(gpu_graph_list_[offset].node_list, g.node_list, g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); for (int64_t j = 0; j < g.node_size; j++) { keys.push_back(g.node_list[j].node_id); - offset.push_back(j); + offsets.push_back(j); } - build_ps(i, (uint64_t*)keys.data(), offset.data(), keys.size(), 1024, 8); - gpu_graph_list[i].node_size = g.node_size; + build_ps(i, (uint64_t*)keys.data(), offsets.data(), keys.size(), 1024, 8, + table_offset); + gpu_graph_list_[offset].node_size = g.node_size; } else { - build_ps(i, NULL, NULL, 0, 1024, 8); - gpu_graph_list[i].node_list = NULL; - gpu_graph_list[i].node_size = 0; + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_list_[offset].node_list = NULL; + gpu_graph_list_[offset].node_size = 0; } if (g.neighbor_size) { cudaError_t cudaStatus = - cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - g.neighbor_size * sizeof(int64_t)); + cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list, + g.neighbor_size * sizeof(uint64_t)); PADDLE_ENFORCE_EQ(cudaStatus, cudaSuccess, platform::errors::InvalidArgument( "ailed to allocate memory for graph on gpu ")); - VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(int64_t) + VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(uint64_t) << " bytes of memory for graph-edges on gpu " << resource_->dev_id(i); - cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, - g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); - gpu_graph_list[i].neighbor_size = g.neighbor_size; + cudaMemcpy(gpu_graph_list_[offset].neighbor_list, g.neighbor_list, + g.neighbor_size * sizeof(uint64_t), cudaMemcpyHostToDevice); + gpu_graph_list_[offset].neighbor_size = g.neighbor_size; } else { - gpu_graph_list[i].neighbor_list = NULL; - gpu_graph_list[i].neighbor_size = 0; + gpu_graph_list_[offset].neighbor_list = NULL; + gpu_graph_list_[offset].neighbor_size = 0; } } -void GpuPsGraphTable::init_sample_status() { - for (int i = 0; i < gpu_num; i++) { - if (gpu_graph_list[i].neighbor_size) { - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - int* addr; - cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int)); - cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int)); - 
sample_status[i] = addr; - } - } -} +void GpuPsGraphTable::build_graph_fea_from_cpu( + std::vector& cpu_graph_fea_list, int ntype_id) { + PADDLE_ENFORCE_EQ( + cpu_graph_fea_list.size(), resource_->total_device(), + platform::errors::InvalidArgument("the cpu node list size doesn't match " + "the number of gpu on your machine.")); + clear_feature_info(ntype_id); + for (int i = 0; i < cpu_graph_fea_list.size(); i++) { + int table_offset = + get_table_offset(i, GraphTableType::FEATURE_TABLE, ntype_id); + int offset = i * feature_table_num_ + ntype_id; + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); + tables_[table_offset] = new Table( + std::max((uint64_t)1, (uint64_t)cpu_graph_fea_list[i].node_size) / + load_factor_); + if (cpu_graph_fea_list[i].node_size > 0) { + std::vector keys; + std::vector offsets; + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].node_list, + cpu_graph_fea_list[i].node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_fea_list_[offset].node_list, + cpu_graph_fea_list[i].node_list, + cpu_graph_fea_list[i].node_size * sizeof(GpuPsGraphNode), + cudaMemcpyHostToDevice); + for (int64_t j = 0; j < cpu_graph_fea_list[i].node_size; j++) { + keys.push_back(cpu_graph_fea_list[i].node_list[j].node_id); + offsets.push_back(j); + } + build_ps(i, (uint64_t*)(keys.data()), offsets.data(), keys.size(), 1024, + 8, table_offset); + gpu_graph_fea_list_[offset].node_size = cpu_graph_fea_list[i].node_size; + } else { + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_fea_list_[offset].node_list = NULL; + gpu_graph_fea_list_[offset].node_size = 0; + } + if (cpu_graph_fea_list[i].feature_size) { + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint64_t)); + + cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, + cpu_graph_fea_list[i].feature_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint64_t), + cudaMemcpyHostToDevice); -void GpuPsGraphTable::free_sample_status() { - for (int i = 0; i < gpu_num; i++) { - if (sample_status[i] != NULL) { - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - cudaFree(sample_status[i]); + // TODO + cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint8_t)); + + cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, + cpu_graph_fea_list[i].slot_id_list, + cpu_graph_fea_list[i].feature_size * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + gpu_graph_fea_list_[offset].feature_size = + cpu_graph_fea_list[i].feature_size; + } else { + gpu_graph_fea_list_[offset].feature_list = NULL; + gpu_graph_fea_list_[offset].slot_id_list = NULL; + gpu_graph_fea_list_[offset].feature_size = 0; } } + cudaDeviceSynchronize(); } + void GpuPsGraphTable::build_graph_from_cpu( - std::vector& cpu_graph_list) { + std::vector& cpu_graph_list, int idx) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " << cpu_graph_list.size(); PADDLE_ENFORCE_EQ( cpu_graph_list.size(), resource_->total_device(), platform::errors::InvalidArgument("the cpu node list size doesn't match " "the number of gpu on your machine.")); - clear_graph_info(); + clear_graph_info(idx); for (int i = 0; i < cpu_graph_list.size(); i++) { + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + int offset = i * graph_table_num_ + idx; platform::CUDADeviceGuard guard(resource_->dev_id(i)); - gpu_graph_list[i] = GpuPsCommGraph(); - sample_status[i] = NULL; - 
tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) / - load_factor_); + gpu_graph_list_[offset] = GpuPsCommGraph(); + tables_[table_offset] = + new Table(std::max((uint64_t)1, (uint64_t)cpu_graph_list[i].node_size) / + load_factor_); if (cpu_graph_list[i].node_size > 0) { - std::vector keys; - std::vector offset; - cudaMalloc((void**)&gpu_graph_list[i].node_list, + std::vector keys; + std::vector offsets; + cudaMalloc((void**)&gpu_graph_list_[offset].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); - cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, + cudaMemcpy(gpu_graph_list_[offset].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); - offset.push_back(j); + offsets.push_back(j); } - build_ps(i, (uint64_t*)(keys.data()), offset.data(), keys.size(), 1024, - 8); - gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; + build_ps(i, (uint64_t*)(keys.data()), offsets.data(), keys.size(), 1024, + 8, table_offset); + gpu_graph_list_[offset].node_size = cpu_graph_list[i].node_size; } else { - build_ps(i, NULL, NULL, 0, 1024, 8); - gpu_graph_list[i].node_list = NULL; - gpu_graph_list[i].node_size = 0; + build_ps(i, NULL, NULL, 0, 1024, 8, table_offset); + gpu_graph_list_[offset].node_list = NULL; + gpu_graph_list_[offset].node_size = 0; } if (cpu_graph_list[i].neighbor_size) { - cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMalloc((void**)&gpu_graph_list_[offset].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(uint64_t)); - cudaMemcpy(gpu_graph_list[i].neighbor_list, + cudaMemcpy(gpu_graph_list_[offset].neighbor_list, cpu_graph_list[i].neighbor_list, - cpu_graph_list[i].neighbor_size * sizeof(int64_t), + cpu_graph_list[i].neighbor_size * sizeof(uint64_t), cudaMemcpyHostToDevice); - gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; + gpu_graph_list_[offset].neighbor_size = cpu_graph_list[i].neighbor_size; } else { - gpu_graph_list[i].neighbor_list = NULL; - gpu_graph_list[i].neighbor_size = 0; + gpu_graph_list_[offset].neighbor_list = NULL; + gpu_graph_list_[offset].neighbor_size = 0; } } cudaDeviceSynchronize(); @@ -599,174 +604,21 @@ void GpuPsGraphTable::build_graph_from_cpu( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { - return graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.key, - q.sample_size, q.len, cpu_switch); + return graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.table_idx, + q.src_nodes, q.sample_size, q.len, + cpu_switch); } + NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, - int64_t* key, + uint64_t* key, int sample_size, int len) { - /* - comment 2 - this function shares some kernels with heter_comm_inl.h - arguments definitions: - gpu_id:the id of gpu. - len:how many keys are used,(the length of array key) - sample_size:how many neighbors should be sampled for each node in key. - the code below shuffle the key array to make the keys - that belong to a gpu-card stay together, - the shuffled result is saved on d_shard_keys, - if ith element in d_shard_keys_ptr is - from jth element in the original key array, then idx[i] = j, - idx could be used to recover the original array. 
- if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = - b, - if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 - for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 - when we run this neighbor_sample function, - the key is shuffled to [0,2,4,6,8,1,3,5,7] - the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, - the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, - h_left = [0,5],h_right = [4,8] - */ - - NeighborSampleResult result; - result.initialize(sample_size, len, resource_->dev_id(gpu_id)); - if (len == 0) { - return result; - } - platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - int* actual_sample_size = result.actual_sample_size; - int64_t* val = result.val; - int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); - - int grid_size = (len - 1) / block_size_ + 1; - - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); - int* d_shard_actual_sample_size_ptr = - reinterpret_cast(d_shard_actual_sample_size->ptr()); - - split_input_to_shard((uint64_t*)(key), d_idx_ptr, len, d_left_ptr, - d_right_ptr, gpu_id); - - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, len, - stream); - cudaStreamSynchronize(stream); - - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - // auto start1 = std::chrono::steady_clock::now(); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - /* - comment 3 - shard_len denotes the size of keys on i-th gpu here, - when we sample on i-th gpu, we allocate shard_len * (1 + sample_size) - int64_t units - of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved - for the respective nodes' indexes - and acutal sample_size. - with nodes' indexes we could get the nodes to sample. - since size of int64_t is 8 bits, while size of int is 4, - the range of [0,shard_len) contains shard_len * 2 int uinits; - The values of the first half of this range will be updated by - the k-v map on i-th-gpu. - The second half of this range is saved for actual sample size of each node. 
- For node x, - its sampling result is saved on the range - [shard_len + sample_size * x,shard_len + sample_size * x + - actual_sample_size_of_x) - of alloc_mem_i, actual_sample_size_of_x equals ((int - *)alloc_mem_i)[shard_len + x] - */ - - create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t) + - sizeof(int) * (shard_len + shard_len % 2)); - // auto& node = path_[gpu_id][i].nodes_[0]; - } - walk_to_dest(gpu_id, total_gpu, h_left, h_right, - (uint64_t*)(d_shard_keys_ptr), NULL); - - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1) { - continue; - } - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - auto& node = path_[gpu_id][i].nodes_.back(); - cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), - node.in_stream); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, gpu_id)); - // node.in_stream); - auto graph = gpu_graph_list[i]; - int64_t* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = - (int64_t*)(actual_size_array + shard_len + shard_len % 2); - int sample_grid_size = (shard_len - 1) / dim_y + 1; - dim3 block(parallel_sample_size, dim_y); - dim3 grid(sample_grid_size); - neighbor_sample_example<<remote_stream(i, gpu_id)>>>( - graph, id_array, actual_size_array, sample_array, sample_size, - sample_status[i], shard_len, gpu_id); - } - - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1) { - continue; - } - cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); - } - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, - h_left, h_right, d_shard_vals_ptr, - d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( - d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, - d_idx_ptr, sample_size, len); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } - cudaStreamSynchronize(stream); - return result; + return graph_neighbor_sample_v2(gpu_id, 0, key, sample_size, len, false); } NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( - int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) { + int gpu_id, int idx, uint64_t* key, int sample_size, int len, + bool cpu_query_switch) { NeighborSampleResult result; result.initialize(sample_size, len, resource_->dev_id(gpu_id)); @@ -777,7 +629,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); int* actual_sample_size = result.actual_sample_size; - int64_t* val = result.val; + uint64_t* val = result.val; int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); @@ -801,10 +653,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); - int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); - int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = + memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -826,8 +679,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( if (shard_len == 0) { continue; } - create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t) + + create_storage(gpu_id, i, shard_len * sizeof(uint64_t), + shard_len * sample_size * sizeof(uint64_t) + + shard_len * sizeof(int64_t) + sizeof(int) * (shard_len + shard_len % 2)); } walk_to_dest(gpu_id, total_gpu, h_left, h_right, @@ -844,22 +698,24 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( cudaStreamSynchronize(node.in_stream); platform::CUDADeviceGuard guard(resource_->dev_id(i)); // If not found, val is -1. 
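      // Layout of node.val_storage for this shard, matching create_storage()
      // above and the pointer arithmetic just below: first shard_len int64_t
      // node indexes written by the hash-table get (-1 when the key is not
      // found, per the memset), then shard_len ints of actual sample sizes
      // padded to an even count, then shard_len * sample_size uint64_t
      // sampled neighbor ids filled by neighbor_sample_kernel.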
- tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, gpu_id)); - - auto graph = gpu_graph_list[i]; + int table_offset = get_table_offset(i, GraphTableType::EDGE_TABLE, idx); + int offset = i * graph_table_num_ + idx; + tables_[table_offset]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + + auto graph = gpu_graph_list_[offset]; int64_t* id_array = reinterpret_cast(node.val_storage); int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = - (int64_t*)(actual_size_array + shard_len + shard_len % 2); + uint64_t* sample_array = + (uint64_t*)(actual_size_array + shard_len + shard_len % 2); constexpr int WARP_SIZE = 32; constexpr int BLOCK_WARPS = 128 / WARP_SIZE; constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE); - neighbor_sample_example_v2< + neighbor_sample_kernel< WARP_SIZE, BLOCK_WARPS, TILE_SIZE><<remote_stream(i, gpu_id)>>>( graph, id_array, actual_size_array, sample_array, sample_size, @@ -872,8 +728,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( } cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); } - - move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + move_result_to_source_gpu(gpu_id, total_gpu, sample_size, h_left, h_right, d_shard_vals_ptr, d_shard_actual_sample_size_ptr); fill_dvalues<<>>( @@ -884,7 +739,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( if (cpu_query_switch) { // Get cpu keys and corresponding position. - thrust::device_vector t_cpu_keys(len); + thrust::device_vector t_cpu_keys(len); thrust::device_vector t_index(len + 1, 0); get_cpu_id_index<<>>( key, actual_sample_size, thrust::raw_pointer_cast(t_cpu_keys.data()), @@ -897,34 +752,34 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( cudaMemcpy(&number_on_cpu, thrust::raw_pointer_cast(t_index.data()), sizeof(int), cudaMemcpyDeviceToHost); if (number_on_cpu > 0) { - int64_t* cpu_keys = new int64_t[number_on_cpu]; + uint64_t* cpu_keys = new uint64_t[number_on_cpu]; cudaMemcpy(cpu_keys, thrust::raw_pointer_cast(t_cpu_keys.data()), - number_on_cpu * sizeof(int64_t), cudaMemcpyDeviceToHost); + number_on_cpu * sizeof(uint64_t), cudaMemcpyDeviceToHost); std::vector> buffers(number_on_cpu); std::vector ac(number_on_cpu); - auto status = cpu_graph_table->random_sample_neighbors( - 0, cpu_keys, sample_size, buffers, ac, false); + auto status = cpu_graph_table_->random_sample_neighbors( + idx, cpu_keys, sample_size, buffers, ac, false); int total_cpu_sample_size = std::accumulate(ac.begin(), ac.end(), 0); - total_cpu_sample_size /= sizeof(int64_t); + total_cpu_sample_size /= sizeof(uint64_t); - // Merge buffers into one int64_t vector. - int64_t* merge_buffers = new int64_t[total_cpu_sample_size]; + // Merge buffers into one uint64_t vector. + uint64_t* merge_buffers = new uint64_t[total_cpu_sample_size]; int start = 0; for (int j = 0; j < number_on_cpu; j++) { - memcpy(merge_buffers + start, (int64_t*)(buffers[j].get()), ac[j]); - start += ac[j] / sizeof(int64_t); + memcpy(merge_buffers + start, (uint64_t*)(buffers[j].get()), ac[j]); + start += ac[j] / sizeof(uint64_t); } // Copy merge_buffers to gpu. 
- thrust::device_vector gpu_buffers(total_cpu_sample_size); + thrust::device_vector gpu_buffers(total_cpu_sample_size); thrust::device_vector gpu_ac(number_on_cpu); - int64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data()); + uint64_t* gpu_buffers_ptr = thrust::raw_pointer_cast(gpu_buffers.data()); int* gpu_ac_ptr = thrust::raw_pointer_cast(gpu_ac.data()); cudaMemcpyAsync(gpu_buffers_ptr, merge_buffers, - total_cpu_sample_size * sizeof(int64_t), + total_cpu_sample_size * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(gpu_ac_ptr, ac.data(), number_on_cpu * sizeof(int), cudaMemcpyHostToDevice, stream); @@ -970,8 +825,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( t_actual_sample_size.end()); result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(int64_t)); - result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); + memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); + result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); @@ -1001,7 +856,7 @@ NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id, return NodeQueryResult(); } -NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, +NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int idx, int start, int query_size) { NodeQueryResult result; if (query_size <= 0) return result; @@ -1009,24 +864,8 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, actual_size = 0; // int dev_id = resource_->dev_id(gpu_id); // platform::CUDADeviceGuard guard(dev_id); - std::vector idx, gpu_begin_pos, local_begin_pos; + std::vector gpu_begin_pos, local_begin_pos; int sample_size; - /* - if idx[i] = a, gpu_begin_pos[i] = p1, - gpu_local_begin_pos[i] = p2; - sample_size[i] = s; - then on gpu a, the nodes of positions [p1,p1 + s) should be returned - and saved from the p2 position on the sample_result array - for example: - suppose - gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] - start = 3, query_size = 5 - we know [6,8,1,3,5] should be returned; - idx = [0,1] - gpu_begin_pos = [3,0] - local_begin_pos = [0,3] - sample_size = [2,3] - */ std::function range_check = []( int x, int y, int x1, int y1, int& x2, int& y2) { if (y <= x1 || x >= y1) return 0; @@ -1034,7 +873,7 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, x2 = max(x1, x); return y2 - x2; }; - auto graph = gpu_graph_list[gpu_id]; + auto graph = gpu_graph_list_[gpu_id]; if (graph.node_size == 0) { return result; } @@ -1044,68 +883,128 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, if (len == 0) { return result; } - int64_t* val; + uint64_t* val; sample_size = len; result.initialize(len, resource_->dev_id(gpu_id)); actual_size = len; val = result.val; int dev_id_i = resource_->dev_id(gpu_id); platform::CUDADeviceGuard guard(dev_id_i); - // platform::CUDADeviceGuard guard(i); int grid_size = (len - 1) / block_size_ + 1; + int offset = gpu_id * graph_table_num_ + idx; node_query_example<<remote_stream(gpu_id, gpu_id)>>>( - gpu_graph_list[gpu_id], x2, len, (int64_t*)val); + gpu_graph_list_[offset], x2, len, (uint64_t*)val); cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)); return result; - /* - for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { - auto graph = gpu_graph_list[i]; - if (graph.node_size == 0) { +} + +int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, std::shared_ptr d_nodes, + std::shared_ptr 
d_feature, int node_num, int slot_num) { + if (node_num == 0) { + return -1; + } + + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + int total_gpu = resource_->total_device(); + auto stream = resource_->local_stream(gpu_id, 0); + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, slot_num * node_num * sizeof(uint64_t)); + uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); + + uint64_t* key = (uint64_t*)d_nodes->ptr(); + split_input_to_shard((uint64_t*)(key), d_idx_ptr, node_num, d_left_ptr, d_right_ptr, gpu_id); + + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, node_num, stream); + cudaStreamSynchronize(stream); + + int h_left[total_gpu]; // NOLINT + cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + int h_right[total_gpu]; // NOLINT + cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { continue; } - int x2, y2; - int len = range_check(start, start + query_size, size, - size + graph.node_size, x2, y2); - if (len > 0) { - idx.push_back(i); - gpu_begin_pos.emplace_back(x2 - size); - local_begin_pos.emplace_back(actual_size); - sample_size.push_back(len); - actual_size += len; - create_storage(gpu_id, i, 1, len * sizeof(int64_t)); - } - size += graph.node_size; - } - for (int i = 0; i < idx.size(); i++) { - int dev_id_i = resource_->dev_id(idx[i]); - platform::CUDADeviceGuard guard(dev_id_i); - // platform::CUDADeviceGuard guard(i); - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - int grid_size = (sample_size[i] - 1) / block_size_ + 1; - node_query_example<<remote_stream(idx[i], gpu_id)>>>( - gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], - (int64_t*)node.val_storage); + create_storage(gpu_id, i, shard_len * sizeof(uint64_t), + shard_len * slot_num * sizeof(uint64_t) + shard_len * sizeof(int64_t) + + sizeof(int) * (shard_len + shard_len % 2)); } - for (int i = 0; i < idx.size(); i++) { - cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), - node.val_storage, node.val_bytes_len, cudaMemcpyDefault, - node.out_stream); + walk_to_dest(gpu_id, total_gpu, h_left, h_right, (uint64_t*)(d_shard_keys_ptr), NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + auto& node = path_[gpu_id][i].nodes_.back(); + cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), node.in_stream); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // If not found, val is -1. + int table_offset = get_table_offset(i, GraphTableType::FEATURE_TABLE, 0); + tables_[table_offset]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + + int offset = i * feature_table_num_; + auto graph = gpu_graph_fea_list_[offset]; + int64_t* val_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(val_array + shard_len); + uint64_t* feature_array = (uint64_t*)(actual_size_array + shard_len + shard_len % 2); + dim3 grid((shard_len - 1) / dim_y + 1); + dim3 block(1, dim_y); + get_features_kernel<<remote_stream(i, gpu_id)>>>( + graph, val_array, actual_size_array, feature_array, slot_num, shard_len); } - for (int i = 0; i < idx.size(); i++) { - auto& node = path_[gpu_id][idx[i]].nodes_.front(); - cudaStreamSynchronize(node.out_stream); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); } - for (auto x : idx) { - destroy_storage(gpu_id, x); + + move_result_to_source_gpu(gpu_id, total_gpu, slot_num, h_left, h_right, + d_shard_vals_ptr, d_shard_actual_size_ptr); + + int grid_size = (node_num - 1) / block_size_ + 1; + uint64_t* result = (uint64_t*)d_feature->ptr(); + fill_dvalues<<>>(d_shard_vals_ptr, result, + d_shard_actual_size_ptr, d_idx_ptr, slot_num, node_num); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); } - return result; - */ + + cudaStreamSynchronize(stream); + + return 0; } + } }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index c976bb67cb21e1..4a4b9929370910 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" namespace paddle { namespace framework { @@ -25,11 +27,26 @@ void GraphGpuWrapper::set_device(std::vector ids) { device_id_mapping.push_back(device_id); } } -std::vector> GraphGpuWrapper::get_all_id(int type, int idx, + +std::vector> GraphGpuWrapper::get_all_id(int type, int slice_num) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_all_id(type, idx, slice_num); + ->cpu_graph_table_->get_all_id(type, slice_num); } + +std::vector> GraphGpuWrapper::get_all_id(int type, + int idx, + int slice_num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->get_all_id(type, idx, slice_num); +} + +int GraphGpuWrapper::get_all_feature_ids(int type, int idx, int slice_num, + std::vector>* output) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->get_all_feature_ids(type, idx, slice_num, output); +} + void GraphGpuWrapper::set_up_types(std::vector &edge_types, std::vector &node_types) { id_to_edge = edge_types; @@ -48,31 +65,39 @@ void GraphGpuWrapper::set_up_types(std::vector &edge_types, this->table_feat_conf_feat_shape.resize(node_types.size()); } +void GraphGpuWrapper::set_feature_separator(std::string ch) { + feature_separator_ = ch; + if (graph_table != nullptr) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->set_feature_separator(feature_separator_); + } +} + void GraphGpuWrapper::make_partitions(int idx, int64_t byte_size, int device_len) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->make_partitions(idx, byte_size, device_len); + ->cpu_graph_table_->make_partitions(idx, byte_size, device_len); } int32_t GraphGpuWrapper::load_next_partition(int idx) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->load_next_partition(idx); + ->cpu_graph_table_->load_next_partition(idx); } void GraphGpuWrapper::set_search_level(int level) { - ((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level); + ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->set_search_level(level); } -std::vector GraphGpuWrapper::get_partition(int idx, int num) { +std::vector GraphGpuWrapper::get_partition(int idx, int num) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_partition(idx, num); + ->cpu_graph_table_->get_partition(idx, num); } int32_t GraphGpuWrapper::get_partition_num(int idx) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->get_partition_num(idx); + ->cpu_graph_table_->get_partition_num(idx); } void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->make_complementary_graph(idx, byte_size); + ->cpu_graph_table_->make_complementary_graph(idx, byte_size); } void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, bool reverse) { @@ -87,7 +112,7 @@ void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, } if (edge_to_id.find(name) != edge_to_id.end()) { ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->Load(std::string(filepath), params); + ->cpu_graph_table_->Load(std::string(filepath), params); } } @@ -98,7 +123,7 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { if (feature_to_id.find(name) != feature_to_id.end()) { ((GpuPsGraphTable *)graph_table) - 
->cpu_graph_table->Load(std::string(filepath), params); + ->cpu_graph_table_->Load(std::string(filepath), params); } } @@ -134,8 +159,9 @@ void GraphGpuWrapper::init_search_level(int level) { search_level = level; } void GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); + table_proto.set_shard_num(1000); table_proto.set_search_level(search_level); - table_proto.set_table_name("cpu_graph_table"); + table_proto.set_table_name("cpu_graph_table_"); table_proto.set_use_cache(false); for (int i = 0; i < id_to_edge.size(); i++) table_proto.add_edge_types(id_to_edge[i]); @@ -152,74 +178,95 @@ void GraphGpuWrapper::init_service() { std::shared_ptr resource = std::make_shared(device_id_mapping); resource->enable_p2p(); - GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1); + GpuPsGraphTable *g = + new GpuPsGraphTable(resource, 1, id_to_edge.size()); g->init_cpu_table(table_proto); + g->cpu_graph_table_->set_feature_separator(feature_separator_); graph_table = (char *)g; } void GraphGpuWrapper::upload_batch(int idx, - std::vector> &ids) { + std::vector> &ids) { + debug_gpu_memory_info("upload_batch node start"); GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; - // std::vector vec; for (int i = 0; i < ids.size(); i++) { - // vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); GpuPsCommGraph sub_graph = - g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]); - g->build_graph_on_single_gpu(sub_graph, i); + g->cpu_graph_table_->make_gpu_ps_graph(idx, ids[i]); + // sub_graph.display_on_cpu(); + g->build_graph_on_single_gpu(sub_graph, i, idx); sub_graph.release_on_cpu(); VLOG(0) << "sub graph on gpu " << i << " is built"; } - // g->build_graph_from_cpu(vec); + debug_gpu_memory_info("upload_batch node end"); } -// void GraphGpuWrapper::test() { -// int64_t cpu_key[3] = {0, 1, 2}; -// void *key; -// platform::CUDADeviceGuard guard(0); -// cudaMalloc((void **)&key, 3 * sizeof(int64_t)); -// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); -// auto neighbor_sample_res = -// ((GpuPsGraphTable *)graph_table) -// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); -// int64_t *res = new int64_t[7]; -// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), -// cudaMemcpyDeviceToHost); -// int *actual_sample_size = new int[3]; -// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, -// 3 * sizeof(int), -// cudaMemcpyDeviceToHost); // 3, 1, 3 +// feature table +void GraphGpuWrapper::upload_batch(std::vector> &node_ids, + int slot_num) { + debug_gpu_memory_info("upload_batch feature start"); + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + for (int i = 0; i < node_ids.size(); i++) { + VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size[" + << node_ids[i].size() << "]"; + GpuPsCommGraphFea sub_graph = g->cpu_graph_table_->make_gpu_ps_graph_fea( + node_ids[i], slot_num); + + // sub_graph.display_on_cpu(); + VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i + << "]_size[" << node_ids[i].size() << "]"; + g->build_graph_fea_on_single_gpu(sub_graph, i); + + sub_graph.release_on_cpu(); + + VLOG(0) << "sub graph fea on gpu " << i << " is built"; + } + // g->build_graph_from_cpu(vec); + debug_gpu_memory_info("upload_batch feature end"); +} -// //{0,9} or {9,0} is expected for key 0 -// //{0,2} or {2,0} is expected for key 1 -// //{1,3} or {3,1} is expected for key 2 -// for (int i = 0; i < 3; i++) { -// VLOG(0) << "actual sample size for " << i << " is " -// << actual_sample_size[i]; -// for (int j 
= 0; j < actual_sample_size[i]; j++) { -// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + -// j]; -// } -// } -// } NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { return ((GpuPsGraphTable *)graph_table) ->graph_neighbor_sample_v3(q, cpu_switch); } +int GraphGpuWrapper::get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, uint32_t size, int slot_num) const { + platform::CUDADeviceGuard guard(gpu_id); + PADDLE_ENFORCE_NOT_NULL(graph_table); + return ((GpuPsGraphTable *)graph_table) + ->get_feature_of_nodes(gpu_id, d_walk, d_offset, size, slot_num); +} + +NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample( + int gpu_id, uint64_t *device_keys, int walk_degree, int len) { + platform::CUDADeviceGuard guard(gpu_id); + auto neighbor_sample_res = + ((GpuPsGraphTable *)graph_table) + ->graph_neighbor_sample(gpu_id, device_keys, walk_degree, len); + + return neighbor_sample_res; +} + // this function is contributed by Liwb5 -std::vector GraphGpuWrapper::graph_neighbor_sample( - int gpu_id, std::vector &key, int sample_size) { - int64_t *cuda_key; +std::vector GraphGpuWrapper::graph_neighbor_sample( + int gpu_id, int idx, std::vector &key, int sample_size) { + std::vector res; + if (key.size() == 0) { + return res; + } + uint64_t *cuda_key; platform::CUDADeviceGuard guard(gpu_id); - cudaMalloc(&cuda_key, key.size() * sizeof(int64_t)); - cudaMemcpy(cuda_key, key.data(), key.size() * sizeof(int64_t), + cudaMalloc(&cuda_key, key.size() * sizeof(uint64_t)); + cudaMemcpy(cuda_key, key.data(), key.size() * sizeof(uint64_t), cudaMemcpyHostToDevice); - + VLOG(0) << "key_size: " << key.size(); auto neighbor_sample_res = ((GpuPsGraphTable *)graph_table) - ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); + ->graph_neighbor_sample_v2(gpu_id, idx, cuda_key, sample_size, + key.size(), false); int *actual_sample_size = new int[key.size()]; cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, key.size() * sizeof(int), @@ -229,11 +276,11 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( cumsum += actual_sample_size[i]; } - std::vector cpu_key, res; + std::vector cpu_key; cpu_key.resize(key.size() * sample_size); cudaMemcpy(cpu_key.data(), neighbor_sample_res.val, - key.size() * sample_size * sizeof(int64_t), + key.size() * sample_size * sizeof(uint64_t), cudaMemcpyDeviceToHost); for (int i = 0; i < key.size(); i++) { for (int j = 0; j < actual_sample_size[i]; j++) { @@ -249,26 +296,19 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( return res; } -void GraphGpuWrapper::init_sample_status() { - ((GpuPsGraphTable *)graph_table)->init_sample_status(); -} - -void GraphGpuWrapper::free_sample_status() { - ((GpuPsGraphTable *)graph_table)->free_sample_status(); -} -NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start, +NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int idx, int start, int query_size) { return ((GpuPsGraphTable *)graph_table) - ->query_node_list(gpu_id, start, query_size); + ->query_node_list(gpu_id, idx, start, query_size); } void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) { return ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table->load_node_weight(type_id, idx, path); + ->cpu_graph_table_->load_node_weight(type_id, idx, path); } void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) - 
->cpu_graph_table->export_partition_files(idx, file_path); + ->cpu_graph_table_->export_partition_files(idx, file_path); } #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index a34e752fc7ea7d..7de234a8703169 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -31,12 +31,13 @@ class GraphGpuWrapper { } static std::shared_ptr s_instance_; void initialize(); - void test(); void set_device(std::vector ids); void init_service(); void set_up_types(std::vector& edge_type, std::vector& node_type); - void upload_batch(int idx, std::vector>& ids); + void upload_batch(int etype_id, std::vector>& ids); + void upload_batch(std::vector>& ids, + int slot_num); void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); @@ -45,22 +46,30 @@ class GraphGpuWrapper { int32_t get_partition_num(int idx); void load_node_weight(int type_id, int idx, std::string path); void export_partition_files(int idx, std::string file_path); - std::vector get_partition(int idx, int num); + std::vector get_partition(int idx, int num); void make_partitions(int idx, int64_t byte_size, int device_len); void make_complementary_graph(int idx, int64_t byte_size); void set_search_level(int level); void init_search_level(int level); - std::vector> get_all_id(int type, int idx, - int slice_num); - NodeQueryResult query_node_list(int gpu_id, int start, int query_size); + std::vector> get_all_id(int type, int slice_num); + std::vector> get_all_id(int type, int idx, + int slice_num); + int get_all_feature_ids(int type, int idx, int slice_num, + std::vector>* output); + NodeQueryResult query_node_list(int gpu_id, int idx, int start, + int query_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); - std::vector graph_neighbor_sample(int gpu_id, - std::vector& key, - int sample_size); + NeighborSampleResult graph_neighbor_sample(int gpu_id, uint64_t* device_keys, + int walk_degree, int len); + std::vector graph_neighbor_sample(int gpu_id, int idx, + std::vector& key, + int sample_size); + void set_feature_separator(std::string ch); + int get_feature_of_nodes(int gpu_id, + std::shared_ptr d_walk, + std::shared_ptr d_offset, uint32_t size, int slot_num) const; - void init_sample_status(); - void free_sample_status(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -71,6 +80,7 @@ class GraphGpuWrapper { std::vector device_id_mapping; int search_level = 1; void* graph_table; + std::string feature_separator_ = std::string(" "); }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h index a7c043f1edf375..335508217fb04a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h @@ -81,10 +81,10 @@ class CommonGraphSampler : public GraphSampler { virtual void init(GpuPsGraphTable *g, std::vector args); GpuPsGraphTable *gpu_table; paddle::distributed::GraphTable *table; - std::vector gpu_edges_count; - int64_t cpu_edges_count; - int64_t gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; - std::vector> gpu_set; + std::vector gpu_edges_count; + uint64_t cpu_edges_count; + uint64_t 
gpu_edges_limit, cpu_edges_limit, gpu_edges_each_limit; + std::vector> gpu_set; int gpu_num; }; @@ -101,7 +101,7 @@ class AllInGpuGraphSampler : public GraphSampler { paddle::distributed::GraphTable *graph_table; GpuPsGraphTable *gpu_table; std::vector> sample_nodes; - std::vector> sample_neighbors; + std::vector> sample_neighbors; std::vector sample_res; // std::shared_ptr random; int gpu_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h index ad4b00b11aa39f..ae05398c148444 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h @@ -24,13 +24,14 @@ int CommonGraphSampler::load_from_ssd(std::string path) { std::cout << values.size(); if (values.size() < 2) continue; auto neighbors = paddle::string::split_string(values[1], ";"); - std::vector neighbor_data; + std::vector neighbor_data; for (auto x : neighbors) { neighbor_data.push_back(std::stoll(x)); } auto src_id = std::stoll(values[0]); - _db->put(0, (char *)&src_id, sizeof(uint64_t), (char *)neighbor_data.data(), - sizeof(int64_t) * neighbor_data.size()); + _db->put(0, (char *)&src_id, sizeof(uint64_t), + (char *)neighbor_data.data(), + sizeof(uint64_t) * neighbor_data.size()); int gpu_shard = src_id % gpu_num; if (gpu_edges_count[gpu_shard] + neighbor_data.size() <= gpu_edges_each_limit) { @@ -49,7 +50,7 @@ int CommonGraphSampler::load_from_ssd(std::string path) { } std::vector graph_list; for (int i = 0; i < gpu_num; i++) { - std::vector ids(gpu_set[i].begin(), gpu_set[i].end()); + std::vector ids(gpu_set[i].begin(), gpu_set[i].end()); graph_list.push_back(table->make_gpu_ps_graph(ids)); } gpu_table->build_graph_from_cpu(graph_list); @@ -69,9 +70,9 @@ void CommonGraphSampler::init(GpuPsGraphTable *g, gpu_edges_each_limit = gpu_edges_limit / gpu_num; if (gpu_edges_each_limit > INT_MAX) gpu_edges_each_limit = INT_MAX; table = g->cpu_graph_table.get(); - gpu_edges_count = std::vector(gpu_num, 0); + gpu_edges_count = std::vector(gpu_num, 0); cpu_edges_count = 0; - gpu_set = std::vector>(gpu_num); + gpu_set = std::vector>(gpu_num); } int AllInGpuGraphSampler::run_graph_sampling() { return 0; } @@ -85,7 +86,7 @@ int AllInGpuGraphSampler::load_from_ssd(std::string path) { sample_res.resize(gpu_num); std::vector>> sample_nodes_ex(graph_table->task_pool_size_); - std::vector>> sample_neighbors_ex( + std::vector>> sample_neighbors_ex( graph_table->task_pool_size_); for (int i = 0; i < graph_table->task_pool_size_; i++) { sample_nodes_ex[i].resize(gpu_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e2f362d4074589..234aa15ebf74d1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -118,8 +118,8 @@ class HashTable { StreamType stream); template - void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - StreamType stream); + void insert(const KeyType* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, StreamType stream); template void get(const KeyType* d_keys, ValType* d_vals, size_t len, diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 5edc218796ef8a..81da79b768218f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
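// A host-side sketch of the pool addressing that the insert_kernel change below
// relies on: each key's value lives at pool + (start_index + i) * feature_value_size
// instead of a hard-coded 80-byte stride. The helper name value_ptr_in_pool is
// illustrative, not part of the patch.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

inline char* value_ptr_in_pool(char* pool, size_t feature_value_size,
                               size_t start_index, size_t i) {
  // 64-bit arithmetic so large pools do not overflow a 32-bit offset.
  uint64_t offset = static_cast<uint64_t>(start_index + i) * feature_value_size;
  return pool + offset;
}

int main() {
  const size_t feature_value_size = 112;  // e.g. an 8-byte-aligned FeatureValue footprint
  std::vector<char> pool(1000 * feature_value_size);
  // The 5th key of a chunk that starts at pool slot 200 maps to slot 205.
  char* v = value_ptr_in_pool(pool.data(), feature_value_size, 200, 5);
  std::cout << "slot offset = " << (v - pool.data()) / feature_value_size
            << std::endl;
  return 0;
}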
@@ -50,7 +50,8 @@ __global__ void insert_kernel(Table* table, template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, - size_t len, char* pool, int start_index) { + size_t len, char* pool, size_t feature_value_size, + int start_index) { ReplaceOp op; thrust::pair kv; @@ -58,7 +59,8 @@ __global__ void insert_kernel(Table* table, if (i < len) { kv.first = keys[i]; - kv.second = (Table::mapped_type)(pool + (start_index + i) * 80); + uint64_t offset = uint64_t(start_index + i) * feature_value_size; + kv.second = (Table::mapped_type)(pool + offset); auto it = table->insert(kv, op); assert(it != table->end() && "error: insert fails: table is full"); } @@ -81,14 +83,29 @@ __global__ void search_kernel(Table* table, template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, - char* const vals, size_t len, + char* vals, size_t len, size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // return; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - *(FeatureValue*)(vals + i * pull_feature_value_size) = *(it->second); + uint64_t offset = i * pull_feature_value_size; + FeatureValue* cur = (FeatureValue*)(vals + offset); + FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } } } } @@ -121,7 +138,7 @@ __global__ void dy_mf_update_kernel(Table* table, FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { - printf("yxf::push miss key: %d", keys[i]); + printf("warning: push miss key: %d", keys[i]); } } } @@ -201,7 +218,8 @@ void HashTable::insert(const KeyType* d_keys, template template void HashTable::insert(const KeyType* d_keys, size_t len, - char* pool, size_t start_index, + char* pool, size_t feature_value_size, + size_t start_index, StreamType stream) { if (len == 0) { return; @@ -210,8 +228,8 @@ void HashTable::insert(const KeyType* d_keys, size_t len, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - insert_kernel<<>>(container_, d_keys, len, - pool, start_index); + insert_kernel<<>>( + container_, d_keys, len, pool, feature_value_size, start_index); } template @@ -319,9 +337,12 @@ void HashTable::update(const KeyType* d_keys, } template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -331,12 +352,18 @@ template void HashTable::get< paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); + template void HashTable::get(const long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t 
len, cudaStream_t stream); template void HashTable::get(const long* d_keys, @@ -354,6 +381,11 @@ template void HashTable::insert< const paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void HashTable:: + insert(const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, + cudaStream_t stream); + template void HashTable::insert(const long* d_keys, const int* d_vals, size_t len, @@ -366,6 +398,11 @@ template void HashTable::insert(const long* d_keys, template void HashTable::insert( const unsigned long* d_keys, const int* d_vals, size_t len, cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, const long* d_vals, size_t len, + cudaStream_t stream); + template void HashTable::insert( const long* d_keys, const unsigned long* d_vals, size_t len, cudaStream_t stream); @@ -374,11 +411,6 @@ template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); -// template void HashTable::insert< -// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, -// size_t start_index, cudaStream_t stream); - template void HashTable:: dump_to_cpu(int devid, cudaStream_t stream); @@ -393,6 +425,16 @@ template void HashTable::update< sgd, cudaStream_t stream); +template void +HashTable::update< + Optimizer, + cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len, + Optimizer + sgd, + cudaStream_t stream); + // template void HashTable::update< // Optimizer #include +#include "cub/cub.cuh" +#include "cub/util_allocator.cuh" #if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/timer.h" #include "thrust/pair.h" #elif defined(PADDLE_WITH_XPU_KP) // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" @@ -38,6 +41,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + template class HeterComm { public: @@ -50,9 +56,13 @@ class HeterComm { int* left, int* right, int gpu_num); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len); // NOLINT + void dynamic_merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, + size_t len, int& uniq_len); void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num); + size_t chunk_size, int stream_num, int offset = -1); + void build_ps(int num, KeyType* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, int stream_num); void dump(); void show_one_table(int gpu_num); int get_index_by_devid(int devid); @@ -96,6 +106,11 @@ class HeterComm { nccl_inter_comms_ = inter_comms; node_size_ = comm_size; } + + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + } #endif bool need_transfer(int send_id, int receive_id) { @@ -114,8 +129,8 @@ class HeterComm { char* key_storage; char* val_storage; int sync; - int key_bytes_len; - int val_bytes_len; + size_t key_bytes_len; + size_t val_bytes_len; int dev_num; }; @@ -202,16 +217,22 @@ class HeterComm { #endif } - void create_storage(int start_index, int end_index, int keylen, int vallen); + void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); void destroy_storage(int start_index, int end_index); void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, GradType* src_val); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, char* src_val, size_t val_size); void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + char* src_val, size_t val_size); protected: using Table = HashTable; + using PtrTable = HashTable; std::vector tables_; + std::vector ptr_tables_; std::shared_ptr resource_; std::vector> path_; float load_factor_{0.75}; @@ -221,6 +242,7 @@ class HeterComm { private: int topo_aware_{0}; std::vector storage_; + DynamicGradMerger merger_; int feanum_{1800 * 2048}; int multi_node_{0}; int node_size_; @@ -228,6 +250,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; + int multi_mf_dim_{8}; + int max_mf_dim_ = 8; std::vector> allocators_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index d23719ea9eb774..9229076e7fd7ff 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_XPU_KP @@ -22,20 +23,32 @@ limitations under the License. 
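// A worked example of the TYPEALIGN macro introduced above: it rounds LEN up to
// the next multiple of ALIGNVAL (a power of two), which is how the variable
// value and gradient footprints below are padded to 8 bytes. DemoFeatureValue is
// a hypothetical stand-in used only for sizing; the real FeatureValue lives in
// feature_value.h.
#include <cstdint>
#include <iostream>

#define TYPEALIGN(ALIGNVAL, LEN) \
  (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

struct DemoFeatureValue {  // hypothetical, for illustration only
  float show, clk, lr, lr_g2sum;
  int slot, mf_dim, mf_size;
  uint64_t cpu_ptr;
  float mf[1];  // dynamically sized tail, like FeatureValue::mf
};

int main() {
  int max_mf_dim = 9;
  uint64_t raw = sizeof(DemoFeatureValue) + sizeof(float) * (max_mf_dim + 1);
  std::cout << raw << " bytes rounds up to " << TYPEALIGN(8, raw) << std::endl;
  std::cout << TYPEALIGN(8, 80) << " " << TYPEALIGN(8, 81) << std::endl;  // 80 88
  return 0;
}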
*/ namespace paddle { namespace framework { - template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { + VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_device()); + multi_mf_dim_ = resource->multi_mf(); for (int i = 0; i < resource_->total_device(); ++i) { #if defined(PADDLE_WITH_CUDA) platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); + if (!multi_mf_dim_) { + auto table = new Table(capacity / load_factor_); + tables_.push_back(table); + } else { + max_mf_dim_ = resource_->max_mf_dim(); + size_t val_type_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + size_t grad_type_size = TYPEALIGN( + 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto ptr_table = new PtrTable(capacity / load_factor_); + ptr_table->set_feature_value_size(val_type_size, grad_type_size); + ptr_tables_.push_back(ptr_table); + } if (multi_node_) { storage_[i].init(feanum_, resource_->dev_id(i)); } @@ -115,21 +128,21 @@ void HeterComm::memory_copy( template void HeterComm::create_storage(int start_index, int end_index, - int keylen, - int vallen) { + size_t keylen, + size_t vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceAllocate( + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].dev_num, start_index)); - allocator->DeviceAllocate( + keylen, resource_->remote_stream(nodes[i].dev_num, start_index))); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].dev_num, start_index)); + vallen, resource_->remote_stream(nodes[i].dev_num, start_index))); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } @@ -157,10 +170,10 @@ void HeterComm::destroy_storage(int start_index, for (size_t i = 0; i < nodes.size(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), - nodes[i].key_storage); - allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), - nodes[i].val_storage); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree( + resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree( + resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage)); } #endif } @@ -238,95 +251,132 @@ void HeterComm::walk_to_dest(int start_index, } template -void HeterComm::walk_to_src(int start_index, - int num, int* h_left, - int* h_right, - ValType* src_val) { +void HeterComm::walk_to_dest( + int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, + char* src_val, size_t val_size) { + int need_copy_val = 0; + if (src_val) { + need_copy_val = 1; + } std::queue que; + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + int size = path_[start_index][i].nodes_.size(); + auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); + que.push(t); 
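+ // Stage the shard bound for GPU i through its transfer path: hop 0 receives the keys (and, when src_val is non-null, the raw byte values) from the source buffers on its in_stream, and the while-loop below forwards each hop's key/val storage to the next hop on the receiving hop's in_stream, synchronizing whenever a hop is marked sync.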
+ cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + if (need_copy_val) { + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + if (cur_task.path->nodes_[cur_task.step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); + } + if (cur_task.step != cur_task.path->nodes_.size() - 1) { + int cur_step = cur_task.step; + CopyTask c(cur_task.path, cur_step + 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + if (need_copy_val) { + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + } + } + } +} - for (int i = 0; i < num; i++) { +template +void HeterComm::walk_to_src( + int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, + size_t val_size) { + std::queue que; + for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; - - auto src_dev_id = resource_->dev_id(i); - auto src_place = DevPlace(src_dev_id); - if (cur_step == 0) { - auto dst_dev_id = resource_->dev_id(start_index); - auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, reinterpret_cast(src_val + h_left[i]), - src_place, node.val_storage, node.val_bytes_len, - node.out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - - auto dst_dev_id = - resource_->dev_id(path_[start_index][i].nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - path_[start_index][i].nodes_[cur_step - 1].val_storage, - src_place, node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - path_[start_index][i].nodes_[cur_step - 1].out_stream); + cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream); } } - while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); int cur_step = cur_task.step; if (cur_task.path->nodes_[cur_step].sync) { - sync_stream(cur_task.path->nodes_[cur_step].out_stream); + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); } - - auto src_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); - auto src_place = DevPlace(src_dev_id); - if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - - auto dst_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, cur_task.path->nodes_[cur_step - 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cur_task.path->nodes_[cur_step - 
1].out_stream); - + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - - auto dst_dev_id = resource_->dev_id(end_index); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - reinterpret_cast(src_val + h_left[end_index]), - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cur_task.path->nodes_[cur_step].out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); } } } template HeterComm::~HeterComm() { - for (auto& table : tables_) { - delete table; - table = nullptr; + if (!multi_mf_dim_) { + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } else { + for (auto& table : ptr_tables_) { + delete table; + table = nullptr; + } + for (auto& table : tables_) { + delete table; + table = nullptr; + } } } template -void HeterComm::show_one_table(int num) { - tables_[num]->show(); +void HeterComm::show_one_table(int gpu_num) { + if (!multi_mf_dim_) { + tables_[gpu_num]->show(); + } } template @@ -362,7 +412,7 @@ void HeterComm::set_embedx_sgd( template void HeterComm::build_ps( int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num) { + size_t chunk_size, int stream_num, int offset) { if (len <= 0) { return; } @@ -403,8 +453,8 @@ void HeterComm::build_ps( memory_copy( dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); - - tables_[dev_num]->insert( + if (offset == -1) offset = dev_num; + tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, cur_use_stream); @@ -418,59 +468,179 @@ void HeterComm::build_ps( } } +template +void HeterComm::build_ps(int num, KeyType* h_keys, + char* pool, size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + + // use hbm pool + std::vector d_key_bufs; + + ppStream streams[stream_num]; // NOLINT + for (int i = 0; i < stream_num; ++i) { + create_stream(&(streams[i])); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + d_key_bufs.push_back(std::move(d_k_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + auto cur_use_stream = streams[cur_stream]; +#if defined(PADDLE_WITH_XPU_KP) + cur_use_stream = 0; +#endif + int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; + + auto dst_place = place; + auto src_place = platform::CPUPlace(); + + memory_copy( + dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + ptr_tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, + pool, feature_value_size, cur_len, cur_use_stream); + cur_stream += 1; + cur_len += tmp_len; + } + for (int i = 0; i < stream_num; ++i) { + sync_stream(streams[i]); + destroy_stream(streams[i]); + } +} + template void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT - int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - size_t temp_storage_bytes; - auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); temp_storage_bytes = 0; - auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - auto dst_place = platform::CPUPlace(); auto src_place = place; memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); - sync_stream(stream); } +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len) { + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; + uint32_t* d_idx = (uint32_t*)&d_index[len]; + int* d_merged_size = (int*)&d_idx[len]; + int grid_size = (len - 1) / block_size_ + 1; + heter_comm_kernel_->fill_idx(d_idx, len, stream); + 
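// A CPU analogue of the de-duplication pipeline that dynamic_merge_grad runs
// below with CUB: (1) sort keys while carrying their original index, (2)
// run-length encode the sorted keys into unique keys plus counts, (3)
// exclusive-sum the counts into run offsets, (4) merge every gradient of one
// key into a single slot (first one initializes, the rest accumulate, mirroring
// update_one / merge_one). DemoGrad and its fields are an illustrative stand-in
// for FeaturePushValue, not the real struct.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

struct DemoGrad {  // illustrative stand-in for FeaturePushValue
  int slot;
  float show, clk, lr_g;
};

int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 3, 3};
  std::vector<DemoGrad> grads = {{1, 1, 0, 0.5f}, {2, 1, 1, 0.2f},
                                 {1, 1, 0, 0.1f}, {2, 1, 0, 0.3f},
                                 {2, 1, 0, 0.4f}};

  // (1) sort (key, original index) pairs -- the role of SortPairs on d_idx/d_index.
  std::vector<uint32_t> index(keys.size());
  std::iota(index.begin(), index.end(), 0);
  std::sort(index.begin(), index.end(),
            [&](uint32_t a, uint32_t b) { return keys[a] < keys[b]; });

  // (2) run-length encode the sorted keys -- unique keys plus per-key counts.
  std::vector<uint64_t> uniq_keys;
  std::vector<uint32_t> fea_num;
  for (uint32_t i : index) {
    if (uniq_keys.empty() || uniq_keys.back() != keys[i]) {
      uniq_keys.push_back(keys[i]);
      fea_num.push_back(0);
    }
    ++fea_num.back();
  }

  // (3) exclusive prefix sum of counts -> start offset of each key's run.
  std::vector<uint32_t> offset(fea_num.size(), 0);
  for (size_t i = 1; i < fea_num.size(); ++i)
    offset[i] = offset[i - 1] + fea_num[i - 1];

  // (4) merge: the first gradient of a run initializes the output slot, the
  // remaining ones accumulate show/clk/lr_g.
  std::vector<DemoGrad> merged(uniq_keys.size());
  for (size_t u = 0; u < uniq_keys.size(); ++u) {
    merged[u] = grads[index[offset[u]]];
    for (uint32_t j = 1; j < fea_num[u]; ++j) {
      const DemoGrad& g = grads[index[offset[u] + j]];
      merged[u].show += g.show;
      merged[u].clk += g.clk;
      merged[u].lr_g += g.lr_g;
    }
  }
  for (size_t u = 0; u < uniq_keys.size(); ++u)
    std::cout << "key " << uniq_keys[u] << ": show=" << merged[u].show
              << " lr_g=" << merged[u].lr_g << std::endl;
  return 0;
}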
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, + 0, 8 * sizeof(KeyType), stream)); + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, + d_merged_size, len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); + + cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + assert(d_merged_size > 0); + uint32_t* d_offset = (uint32_t*)&d_index[len]; + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, + stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + heter_comm_kernel_->merge_gradient( + d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + template void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, @@ -529,8 +699,6 @@ void HeterComm::pull_sparse(int num, AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(num, 0); - // int grid_size = (len - 1) / block_size_ + 1; - int h_left[total_device]; // NOLINT int h_right[total_device]; // NOLINT @@ -562,10 +730,11 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - + size_t val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(ValType)); + auto d_shard_vals = memory::Alloc(place, len * val_type_size); ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -589,9 +758,8 @@ void HeterComm::pull_sparse(int num, continue; } create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(ValType)); + shard_len * val_type_size); } - walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); for 
(int i = 0; i < total_device; ++i) { @@ -600,14 +768,11 @@ void HeterComm::pull_sparse(int num, } auto& node = path_[num][i].nodes_.back(); sync_stream(node.in_stream); - AnyDeviceGuard guard(resource_->dev_id(i)); - - tables_[i]->rwlock_->RDLock(); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + ptr_tables_[i]->rwlock_->RDLock(); + ptr_tables_[i]->get(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); } for (int i = 0; i < total_device; ++i) { @@ -615,21 +780,18 @@ void HeterComm::pull_sparse(int num, if (h_left[i] == -1) { continue; } - tables_[i]->rwlock_->UNLock(); + ptr_tables_[i]->rwlock_->UNLock(); } - - walk_to_src(num, total_device, h_left, h_right, d_shard_vals_ptr); - + walk_to_src(num, total_device, h_left, h_right, + reinterpret_cast(d_shard_vals_ptr), val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - - heter_comm_kernel_->fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, + val_type_size, stream); sync_stream(stream); - for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -653,6 +815,8 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -691,21 +855,33 @@ void HeterComm::push_sparse(int dev_num, auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + + GradType* d_shard_grads_ptr; + if (!multi_mf_dim_) { + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } else { + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } int uniq_len = len; - merge_grad(dev_num, d_keys, d_grads, len, uniq_len); + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - // int grid_size = (uniq_len - 1) / block_size_ + 1; + int grid_size = (uniq_len - 1) / block_size_ + 1; split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, stream); + if (!multi_mf_dim_) { + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); + } else { + heter_comm_kernel_->dy_mf_fill_shard_grads( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size, stream); + } sync_stream(stream); @@ -721,12 +897,22 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + if (!multi_mf_dim_) { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * 
sizeof(GradType)); + } else { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * grad_value_size); + } } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - d_shard_grads_ptr); + if (!multi_mf_dim_) { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + } else { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), grad_value_size); + } for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -736,17 +922,28 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, dev_num)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } else { + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } } for (int i = 0; i < total_device; ++i) { sync_stream(resource_->remote_stream(i, dev_num)); if (h_left[i] != -1) { - tables_[i]->rwlock_->UNLock(); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } } } @@ -1078,11 +1275,13 @@ void HeterComm::end_pass() { tables_[index]->dump_to_cpu(dev_id, stream); }; - for (int i = 0; i < total_device; ++i) { - threads.push_back(std::thread(dump_to_cpu_func, i)); - } - for (auto& t : threads) { - t.join(); + if (!multi_mf_dim_) { + for (int i = 0; i < total_device; ++i) { + threads.push_back(std::thread(dump_to_cpu_func, i)); + } + for (auto& t : threads) { + t.join(); + } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index bdeb696a92bcef..94d7929b2947d2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,6 +117,53 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, size_t len, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + *(GradType*)((char*)d_shard_grads + i * grad_value_size) = + *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + } +} + +__global__ void merge_gradients_kernel(const uint32_t* offset, + const uint32_t* fea_num, + const uint32_t* index, const char* input, + char* output, int n, + size_t grad_value_size, + DynamicGradMerger& merger_) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < n) { + uint32_t start = offset[i]; + uint32_t num = fea_num[i]; + int ori_index = index[start]; + FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& in = + *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.update_one(out, in); + for (int j = 1; j < num; ++j) { + ori_index = index[start + j]; + in = 
*(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.merge_one(out, in); + } + } +} + +template +__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, size_t len, size_t val_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint64_t new_offset = uint64_t(idx[i]) * val_size; + *(ValType*)((char*)d_vals + new_offset) = + *(ValType*)((char*)d_shard_vals + i * val_size); + } +} + // cuda implemention of heter_comm_kernel.h template void HeterCommKernel::fill_idx(T* idx, long long len, @@ -207,8 +254,42 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, size_t grad_value_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_shard_grads_kernel<<>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len, + grad_value_size); +} + +template +void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + merge_gradients_kernel<<>>( + offset, fea_num, index, input, output, n, grad_value_size, merger_); +} + +template +void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, + T* idx, long long len, size_t val_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_dvals_kernel<<>>( + d_shard_vals, d_vals, idx, c_len, val_size); +} + template void HeterCommKernel::fill_idx( int* idx, long long len, const cudaStream_t& stream); +template void HeterCommKernel::fill_idx( + uint32_t* idx, long long len, const cudaStream_t& stream); template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, @@ -270,6 +351,23 @@ template void HeterCommKernel::reduce_by_key< paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous); +template void HeterCommKernel::dy_mf_fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + size_t grad_value_size, const cudaStream_t& stream); + +template void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const cudaStream_t& stream); + +template void HeterCommKernel::dy_mf_fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + size_t val_size, const cudaStream_t& stream); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 9d2ee5d272c722..4f866ccda82017 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -27,6 +27,42 @@ limitations under the 
License. */ namespace paddle { namespace framework { +struct DynamicGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + + return out; + } + + template + __device__ __forceinline__ void update_one(T& output, const T& input) { + output.slot = input.slot; + output.show = input.show; + output.clk = input.clk; + output.mf_dim = input.mf_dim; + output.lr_g = input.lr_g; + for (int i = 0; i < output.mf_dim; ++i) { + output.mf_g[i] = input.mf_g[i]; + } + } + template + __device__ __forceinline__ void merge_one(T& output, const T& input) { + output.show += input.show; + output.clk += input.clk; + output.lr_g += input.lr_g; + for (int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; + } + } +}; + class HeterCommKernel { public: HeterCommKernel() {} @@ -80,6 +116,24 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); + template + void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, + T* idx, long long len, size_t grad_value_size, + const StreamType& stream); + + template + void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, + int n, size_t grad_value_size, DynamicGradMerger& merger_, + const StreamType& stream); + + template + void dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, size_t val_size, + const StreamType& stream); + private: int block_size_{256}; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 66e06b13b046f4..43b84ee5d26fbe 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -44,6 +44,13 @@ void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); } +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) { + comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, + stream_num); +} + int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } @@ -72,6 +79,10 @@ void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 70b88350f2720a..8449a4048b72f9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -37,11 +37,14 @@ class HeterPs : public HeterPsBase { size_t len) override; void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) override; - + void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) override; #if defined(PADDLE_WITH_CUDA) void set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int 
comm_size) override; + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 0727e2c2dbce1c..2c312e9d4d60aa 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -35,11 +35,15 @@ class HeterPsBase { size_t len) = 0; virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) = 0; + virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 17bc12a5af1a73..5717f44d400a55 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -107,6 +107,8 @@ class HeterPsResource { int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); + int multi_mf() { return multi_mf_dim_; } + int max_mf_dim() { return max_mf_dim_; } ppStream local_stream(int dev_num, int stream_num); ppStream remote_stream(int dev_num, int stream_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 065d5e6d527fc0..4684b4a0bc155c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -125,20 +125,21 @@ class Optimizer { if (optimizer_config.mf_create_thresholds <= optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + optimizer_config.clk_coeff * ptr->clk) { - // ptr->mf_size = ptr->mf_dim + 1; + ptr->mf_size = ptr->mf_dim + 1; - ptr->mf_size = MF_DIM + 1; + // ptr->mf_size = MF_DIM + 1; ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { + for (int i = 0; i < ptr->mf_dim; ++i) { ptr->mf[i + 1] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], + grad.mf_g, grad.show); // for local test } } diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index ff3cd9d2d046d1..afeaf0b5541e44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -25,9 +25,6 @@ using namespace paddle::framework; namespace platform = paddle::platform; -// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph -// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( -// std::vector ids) std::string edges[] = { std::string("0\t1"), std::string("0\t9"), std::string("1\t2"), @@ -109,13 +106,13 @@ TEST(TEST_FLEET, test_cpu_cache) { 
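// A minimal sketch of the per-feature embedding creation that the optimizer
// change above switches from the compile-time MF_DIM to the runtime ptr->mf_dim:
// mf_size becomes mf_dim + 1, with mf[0] used as the accumulator that update_mf
// consumes and the next mf_dim entries holding the embedding. DemoValue and
// init_mf are illustrative names, not the real FeatureValue / optimizer code.
#include <cstdlib>
#include <iostream>
#include <vector>

struct DemoValue {  // illustrative stand-in for FeatureValue
  int mf_dim = 0;
  int mf_size = 0;
  std::vector<float> mf;
};

void init_mf(DemoValue& v, float mf_initial_range) {
  v.mf_size = v.mf_dim + 1;    // accumulator slot 0 + mf_dim embedding entries
  v.mf.assign(v.mf_size, 0.0f);
  v.mf[0] = 0.0f;
  for (int i = 0; i < v.mf_dim; ++i) {
    // uniform in [0, mf_initial_range), roughly mirroring curand_uniform * range
    v.mf[i + 1] = mf_initial_range * (std::rand() / (RAND_MAX + 1.0f));
  }
}

int main() {
  DemoValue a, b;
  a.mf_dim = 8;   // different features may now carry different dims
  b.mf_dim = 64;
  init_mf(a, 0.02f);
  init_mf(b, 0.02f);
  std::cout << "a.mf_size=" << a.mf_size << " b.mf_size=" << b.mf_size
            << std::endl;
  return 0;
}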
std::make_shared(device_id_mapping); resource->enable_p2p(); int use_nv = 1; - GpuPsGraphTable g(resource, use_nv); + GpuPsGraphTable g(resource, use_nv, 1, 2); g.init_cpu_table(table_proto); - g.cpu_graph_table->Load(node_file_name, "nuser"); - g.cpu_graph_table->Load(node_file_name, "nitem"); + g.cpu_graph_table_->Load(node_file_name, "nuser"); + g.cpu_graph_table_->Load(node_file_name, "nitem"); std::remove(node_file_name); std::vector vec; - std::vector node_ids; + std::vector node_ids; node_ids.push_back(37); node_ids.push_back(96); std::vector> node_feat(2, @@ -123,38 +120,29 @@ TEST(TEST_FLEET, test_cpu_cache) { std::vector feature_names; feature_names.push_back(std::string("c")); feature_names.push_back(std::string("d")); - g.cpu_graph_table->get_node_feat(0, node_ids, feature_names, node_feat); + g.cpu_graph_table_->get_node_feat(0, node_ids, feature_names, node_feat); VLOG(0) << "get_node_feat: " << node_feat[0][0]; VLOG(0) << "get_node_feat: " << node_feat[0][1]; VLOG(0) << "get_node_feat: " << node_feat[1][0]; VLOG(0) << "get_node_feat: " << node_feat[1][1]; int n = 10; - std::vector ids0, ids1; + std::vector ids0, ids1; for (int i = 0; i < n; i++) { - g.cpu_graph_table->add_comm_edge(0, i, (i + 1) % n); - g.cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n); + g.cpu_graph_table_->add_comm_edge(0, i, (i + 1) % n); + g.cpu_graph_table_->add_comm_edge(0, i, (i - 1 + n) % n); if (i % 2 == 0) ids0.push_back(i); } - g.cpu_graph_table->build_sampler(0); + g.cpu_graph_table_->build_sampler(0); ids1.push_back(5); ids1.push_back(7); - vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0)); - vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1)); + vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids0)); + vec.push_back(g.cpu_graph_table_->make_gpu_ps_graph(0, ids1)); vec[0].display_on_cpu(); vec[1].display_on_cpu(); // g.build_graph_from_cpu(vec); - g.build_graph_on_single_gpu(vec[0], 0); - g.build_graph_on_single_gpu(vec[1], 1); - int64_t cpu_key[3] = {0, 1, 2}; - /* - std::vector> buffers(3); - std::vector actual_sizes(3,0); - g.cpu_graph_table->random_sample_neighbors(cpu_key,2,buffers,actual_sizes,false); - for(int i = 0;i < 3;i++){ - VLOG(0)<<"sample from cpu key->"<set_search_level(2); - // g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u"); - g.cpu_graph_table->Load(edge_file_name, "e>u2u"); - g.cpu_graph_table->make_partitions(0, 64, 2); + g.cpu_graph_table_->clear_graph(0); + g.cpu_graph_table_->set_search_level(2); + g.cpu_graph_table_->Load(edge_file_name, "e>u2u"); + g.cpu_graph_table_->make_partitions(0, 64, 2); int index = 0; - while (g.cpu_graph_table->load_next_partition(0) != -1) { - auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len); + while (g.cpu_graph_table_->load_next_partition(0) != -1) { + auto all_ids = g.cpu_graph_table_->get_all_id(0, 0, device_len); for (auto x : all_ids) { for (auto y : x) { VLOG(0) << "part " << index << " " << y; @@ -195,19 +183,19 @@ TEST(TEST_FLEET, test_cpu_cache) { } for (int i = 0; i < all_ids.size(); i++) { GpuPsCommGraph sub_graph = - g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]); - g.build_graph_on_single_gpu(sub_graph, i); + g.cpu_graph_table_->make_gpu_ps_graph(0, all_ids[i]); + g.build_graph_on_single_gpu(sub_graph, i, 0); VLOG(2) << "sub graph on gpu " << i << " is built"; } VLOG(0) << "start to iterate gpu graph node"; - g.cpu_graph_table->make_complementary_graph(0, 64); + g.cpu_graph_table_->make_complementary_graph(0, 64); for (int i = 0; i < 2; i++) { // platform::CUDADeviceGuard 
guard(i); LOG(0) << "query on card " << i; int step = 2; int cur = 0; while (true) { - auto node_query_res = g.query_node_list(i, cur, step); + auto node_query_res = g.query_node_list(i, 0, cur, step); node_query_res.display(); if (node_query_res.get_len() == 0) { VLOG(0) << "no more ids,break"; @@ -215,23 +203,23 @@ TEST(TEST_FLEET, test_cpu_cache) { } cur += node_query_res.get_len(); NeighborSampleQuery query, q1; - query.initialize(i, node_query_res.get_val(), 4, + query.initialize(i, 0, node_query_res.get_val(), 4, node_query_res.get_len()); query.display(); auto c = g.graph_neighbor_sample_v3(query, true); c.display(); platform::CUDADeviceGuard guard(i); - int64_t *key; + uint64_t *key; VLOG(0) << "sample key 1 globally"; - g.cpu_graph_table->set_search_level(2); - cudaMalloc((void **)&key, sizeof(int64_t)); - int64_t t_key = 1; - cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice); - q1.initialize(i, (int64_t)key, 2, 1); + g.cpu_graph_table_->set_search_level(2); + cudaMalloc((void **)&key, sizeof(uint64_t)); + uint64_t t_key = 1; + cudaMemcpy(key, &t_key, sizeof(uint64_t), cudaMemcpyHostToDevice); + q1.initialize(i, 0, (uint64_t)key, 2, 1); auto d = g.graph_neighbor_sample_v3(q1, true); d.display(); cudaFree(key); - g.cpu_graph_table->set_search_level(1); + g.cpu_graph_table_->set_search_level(1); } } index++; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index f512fcc7b9fdbe..cf9fb14bb9b9cd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,12 +28,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" + #include #include -#include "paddle/fluid/framework/data_set.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" +#if defined(PADDLE_WITH_PSCORE) +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { @@ -107,29 +111,16 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - if (!multi_mf_dim_) { - gpu_task->init(thread_keys_shard_num_, device_num); - } else { - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - } - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); std::vector threads; - // data should be in input channel - if (!multi_mf_dim_) { - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); - } - } else { - thread_dim_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_dim_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - thread_dim_keys_[i][j].resize(multi_mf_dim_); - } + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); } } @@ -140,68 +131,128 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); - if 
(data_set_name.find("SlotRecordDataset") != std::string::npos) { - VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + VLOG(0) << "gpu_graph_mode_:" << gpu_graph_mode_; + if (!gpu_graph_mode_) { + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "psgpu wrapperinputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_dynamic_mf_func = [this]( + const std::deque& total_data, int begin_index, + int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; + for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); + slot_idx++) { + for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + int shard_id = feasign_v[j] % thread_keys_shard_num_; + int dim_id = slot_index_vec_[slot_idx]; + if (feasign_v[j] != 0) { + this->thread_dim_keys_[i][shard_id][dim_id].insert( + feasign_v[j]); + } + } + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() + << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() + << " seconds."; + } + } else { + VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = dynamic_cast(dataset_); - auto input_channel = dataset->GetInputChannel(); - VLOG(0) << "yxf::buildtask::inputslotchannle size: " - << input_channel->Size(); - const std::deque& vec_data = input_channel->GetData(); + const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); total_len = vec_data.size(); len_per_thread = total_len / thread_keys_thread_num_; + VLOG(0) << "GpuGraphTotalKeys: " << total_len; remain = total_len % thread_keys_thread_num_; - VLOG(0) << "total len: " << total_len; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(feasign); - } - } - }; - auto gen_dynamic_mf_func = [this](const std::deque& total_data, + auto gen_graph_data_func = [this](const std::vector& total_data, int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; - for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); - slot_idx++) { - for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; - j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { - int shard_id = feasign_v[j] % thread_keys_shard_num_; - int dim_id = slot_index_vec_[slot_idx]; - this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); - } - } + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); } - /* + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, int begin_index, int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_dim_keys_[i][shard_id][0].insert(feasign); - } + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // int dim_id = slot_index_vec_[slot_idx]; + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); } - */ }; for (int i = 0; i < thread_keys_thread_num_; i++) { if (!multi_mf_dim_) { - VLOG(0) << "yxf::psgpu wrapper genfunc"; + VLOG(0) << "psgpu graph wrapper genfunc"; threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, + std::thread(gen_graph_data_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); } else { - VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; + VLOG(0) << "psgpu graph wrapper genfunc with dynamic mf"; threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); } begin += len_per_thread + (i < remain ? 
1 : 0); @@ -209,54 +260,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { for (std::thread& t : threads) { t.join(); } - timeline.Pause(); - VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; - } else { - CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); - VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; - MultiSlotDataset* dataset = dynamic_cast(dataset_); - auto input_channel = dataset->GetInputChannel(); - - const std::deque& vec_data = input_channel->GetData(); - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - remain = total_len % thread_keys_thread_num_; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - begin += len_per_thread + (i < remain ? 1 : 0); - } - for (std::thread& t : threads) { - t.join(); - } - timeline.Pause(); - VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); threads.clear(); // merge thread_keys to shard_keys - auto merge_ins_func = [this, gpu_task](int shard_num) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); - thread_keys_[i][shard_num].clear(); - } - }; auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { gpu_task->batch_add_keys(shard_num, dim_id, @@ -264,19 +273,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { thread_dim_keys_[i][shard_num][dim_id].clear(); } }; - // for (size_t i = 0; i < thread_keys_.size(); i++) { - // gpu_task->batch_add_keys(thread_keys_[i]); - // for (int j = 0; j < thread_keys_thread_num_; j++) { - // thread_keys_[i][j].clear(); - // } - //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - if (!multi_mf_dim_) { - threads.push_back(std::thread(merge_ins_func, i)); - } else { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); } } for (auto& t : threads) { @@ -291,20 +290,15 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { timeline.Pause(); VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - - if (!multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); - local_ptr[i].resize(local_keys[i].size()); - } - } else { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if 
(i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); } + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); } } } @@ -324,12 +318,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; auto& device_dim_mutex = gpu_task->dim_mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -353,18 +347,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif timeline.Start(); - auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { - size_t key_size = local_keys[i].size(); + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { + size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; -#ifdef PADDLE_WITH_PSLIB - // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - // reinterpret_cast(local_ptr[i].data()), this->table_id_, - // local_keys[i].data(), key_size); int32_t cnt = 0; +#ifdef PADDLE_WITH_PSLIB while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -392,11 +385,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - int32_t cnt = 0; while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -423,51 +415,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(300); - exit(-1); - } else { - VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " - << local_keys[i].size(); - } - }; - - auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, - &fleet_ptr](int i, int j) { -#ifdef PADDLE_WITH_PSLIB - size_t key_size = local_dim_keys[i][j].size(); - int32_t status = -1; - int32_t cnt = 0; - while (true) { - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_dim_ptr[i][j].data()), - this->table_id_, local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } - - if (flag) { - break; - } - } if 
(status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -476,23 +423,19 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " << local_dim_keys[i][j].size(); } -#endif }; - if (!multi_mf_dim_) { - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(ptl_func, i); - } - } else { - threads.resize(thread_keys_shard_num_ * multi_mf_dim_); - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, j); - } + + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + task_futures.emplace_back( + pull_thread_pool_[i]->enqueue(ptl_dynamic_mf_func, i, j)); } } - for (std::thread& t : threads) { - t.join(); + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; @@ -509,45 +452,40 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { std::vector>> pass_values; bool record_status = false; -#ifdef PADDLE_WITH_PSLIB - uint16_t pass_id = 0; - if (multi_node_) { - record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( - table_id_, pass_id, pass_values); - } -#endif auto& device_task_keys = gpu_task->device_task_keys_; auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, - &local_dim_ptr, &device_dim_keys, - &device_dim_ptr, - &device_dim_mutex](int i, int j) { -#ifdef PADDLE_WITH_PSLIB + auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { int shard = local_dim_keys[i][j][k] % device_num; task_keys[shard].push_back(local_dim_keys[i][j][k]); task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } + // allocate local keys to devices for (int dev = 0; dev < device_num; dev++) { - for (int dim = 0; dim < multi_mf_dim_; dim++) { - device_dim_mutex[dev][dim]->lock(); - - int len = task_keys[dev].size(); - int cur = device_dim_keys[dev][dim].size(); - device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + - len); - device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); - for (int k = 0; k < len; ++k) { - device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; - device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; - } - device_dim_mutex[dev][dim]->unlock(); + device_dim_mutex[dev][j]->lock(); + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; } + device_dim_mutex[dev][j]->unlock(); } -#endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, &local_ptr, &device_task_keys, &device_task_ptrs](int i) { @@ -697,7 +635,7 @@ void 
PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { threads[i * multi_mf_dim_ + j] = - std::thread(build_dynamic_mf_func, i, j); + std::thread(build_pull_dynamic_mf_func, i, j); } } for (std::thread& t : threads) { @@ -727,22 +665,19 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { std::vector feature_keys_count(device_num); size_t size_max = 0; - if (!multi_mf_dim_) { - for (int i = 0; i < device_num; i++) { - feature_keys_count[i] = gpu_task->device_keys_[i].size(); - VLOG(0) << i << " card contains feasign nums: " << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); - } - } else { - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); - } - VLOG(0) << i << " card with dynamic mf contains feasign nums: " - << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); + + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + VLOG(1) << i << " card with dynamic mf dim: " << index_dim_vec_[j] + << " dim index: " << j << " contains feasign nums: " + << gpu_task->device_dim_ptr_[i][j].size(); } + VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); } + if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; @@ -756,18 +691,95 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); #endif - auto build_func = [this, &gpu_task, &feature_keys_count](int i) { - VLOG(3) << "building table: " << i; - this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), - gpu_task->device_values_[i].data(), - feature_keys_count[i], 500000, 2); - // if (feature_keys_count[i] > 0) { - // HeterPs_->show_one_table(i); - // } + auto build_dynamic_mf_func = [this, &gpu_task](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + for (size_t k = 0; k < len; k++) { + FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); +#ifdef PADDLE_WITH_PSLIB + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + 
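+ // Layout note: ptr_val is the CpuPS-side float array filled by the Dymf
+ // accessor; its first 8 floats are the fixed header addressed through the
+ // accessor indices used here, and anything past index 8 is the expanded
+ // g2sum plus mf_dim embedding weights, which is what the `dim > 8` branch
+ // below copies into val->mf.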
val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor accessor; + val->delta_score = + ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; + val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; + val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; + val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]); + val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; + val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; + + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } + } + + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + + this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, + feature_value_size, 500000, 2); + + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << mf_dim << " len: " << len; + this->HeterPs_->show_one_table(i); + } + delete mem_pool; }; - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + } } + for (std::thread& t : threads) { t.join(); } @@ -788,10 +800,12 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { if (is_shuffle) { dataset_->LocalShuffle(); } - + InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -873,13 +887,105 @@ void PSGPUWrapper::EndPass() { timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { - keysize_max = std::max(keysize_max, current_task_->device_keys_[i].size()); + for (int j = 0; j < multi_mf_dim_; j++) { + keysize_max = + std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); + } + } + + auto dump_pool_to_cpu_func = [this](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + + char* test_build_values = (char*)malloc(feature_value_size * len); + 
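+ // Each record in the HBM pool spans feature_value_size =
+ // TYPEALIGN(8, sizeof(FeatureValue) + (mf_dim + 1) * sizeof(float)) bytes, so
+ // the device-to-host copy below brings back `len` fixed-stride records and the
+ // value for key `index` is decoded at offset index * feature_value_size.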
cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; ++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); +#ifdef PADDLE_WITH_PSLIB + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + delta_score_index()] = gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + show_index()] = gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + click_index()] = gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_w_index()] = gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_g2sum_index()] = gpu_val->lr_g2sum; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + slot_index()] = gpu_val->slot; +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + + paddle::distributed::CtrDymfAccessor accessor; + cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = + gpu_val->delta_score; + cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; + cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; + cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; + cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = + gpu_val->lr_g2sum; + cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; +#endif + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + cpu_val[x + 8] = gpu_val->mf[x]; + } + } + } + free(test_build_values); + }; + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } } if (keysize_max != 0) { HeterPs_->end_pass(); } - + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -936,8 +1042,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS.")); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length 
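The hunk that follows adds a dim-aware PullSparse overload: the scratch buffer is sized from what appears to be the largest configured embedding width (index_dim_vec_.back()), each pooled record spans TYPEALIGN(8, sizeof(FeatureValue) + (mf_dim + 1) * sizeof(float)) bytes, and PullCopy later recovers a slot's mf_dim as its slot dim minus the three leading [show, clk, lr] floats. A minimal sketch of that size and stride arithmetic, assuming TYPEALIGN(8, len) rounds len up to a multiple of 8 and using a hypothetical FeatureValueSketch in place of Paddle's FeatureValue:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in with the fields the wrapper touches; the real
// FeatureValue is defined in heter_ps and may differ.
struct FeatureValueSketch {
  float delta_score, show, clk, lr, lr_g2sum;
  int slot, mf_dim, mf_size;
  std::uint64_t cpu_ptr;
  float mf[1];  // trailing storage: 1 accumulator + mf_dim weights
};

// Assumed behaviour of TYPEALIGN(8, len): round len up to a multiple of 8.
constexpr std::size_t AlignUp8(std::size_t len) {
  return (len + 7) & ~static_cast<std::size_t>(7);
}

// Bytes per pooled record for a given embedding width.
constexpr std::size_t PooledValueSize(int mf_dim) {
  return AlignUp8(sizeof(FeatureValueSketch) + (mf_dim + 1) * sizeof(float));
}

int main() {
  int slot_dim = 11;          // a slot configured with dim 11, for illustration
  int mf_dim = slot_dim - 3;  // PullCopy: gpu_dim[x] - 3 ([show, clk, lr] lead)
  std::printf("record bytes: %zu, dense output stride: %d floats\n",
              PooledValueSize(mf_dim), mf_dim + 3);
  return 0;
}

The 8-byte rounding presumably keeps byte-indexed records aligned for the uint64_t member when the pool is addressed as base + index * feature_value_size.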
@@ -945,6 +1049,97 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, static_cast(slot_lengths.size()), hidden_size, total_length); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; +} + +void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, + const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, + const int hidden_size) { + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + size_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + size_t feature_value_size = 0; + + feature_value_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begine Gpu Ps PullSparse"; + auto buf = memory::Alloc(place, total_length * feature_value_size); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PullSparse"; + FeatureValue* total_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_values_gpu), + total_length * feature_value_size); +#endif + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(total_keys_tensor.mutable_data( + {int64_t(total_length), 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); + int* gpu_dim = reinterpret_cast(buf_dim->ptr()); + cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + total_length); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + 
total_length, gpu_dim); + + pull_gpups_timer.Pause(); + #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP @@ -1013,7 +1208,10 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto buf = memory::Alloc(place, total_length * grad_value_size); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { @@ -1027,8 +1225,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + if (!multi_mf_dim_) { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + } else { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size); + } VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1060,6 +1263,8 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); } all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() << " s"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 3df5a4b473861e..488a9ef8ce78ff 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -61,6 +61,45 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? 
len[x - 1] : 0); + FeatureValue* feature_value_ptr = + (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + if (*(keys[x] + y) == 0) { + *(dest[x] + y * (mf_dim + 3)) = 0; + *(dest[x] + y * (mf_dim + 3) + 1) = 0; + *(dest[x] + y * (mf_dim + 3) + 2) = 0; + } else { + *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; + *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; + *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; + } + if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; + } + } + } +} + __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -105,6 +144,35 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +__global__ void PushCopyWithPool(FeaturePushValue* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue* cur = + (FeaturePushValue*)((char*)dest + i * grad_value_size); + cur->slot = slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur->mf_dim = mf_dim; + cur->show = *(src[x] + y * (mf_dim + 3)); + cur->clk = *(src[x] + y * (mf_dim + 3) + 1); + cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < cur->mf_dim; j++) { + cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, @@ -128,6 +196,26 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, int* gpu_dim) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + val_type_size_, gpu_dim); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -177,6 +265,45 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, size_t grad_value_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c38b819822c28b..3addf23ce82071 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ 
b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif #include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" @@ -54,6 +55,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" #endif +#ifdef PADDLE_WITH_PSLIB +#include "downpour_accessor.h" // NOLINT +#endif namespace paddle { namespace framework { @@ -95,12 +99,21 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; + pull_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < pull_thread_pool_.size(); i++) { + pull_thread_pool_[i].reset(new ::ThreadPool(1)); + } hbm_thread_pool_.resize(thread_keys_shard_num_); for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } } + void PullSparse(const paddle::platform::Place& place, const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, const int hidden_size); void PullSparse(const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, @@ -119,13 +132,23 @@ class PSGPUWrapper { const FeatureValue* total_values_gpu, const int64_t* gpu_len, const int slot_num, const int hidden_size, const int64_t total_length); - + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim); void CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, FeaturePushValue* total_grad_values_gpu, const std::vector& slot_lengths, const int hidden_size, const int64_t total_length, const int batch_size); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -310,13 +333,40 @@ class PSGPUWrapper { void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; + std::cout << "yxf set: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; } #ifdef PADDLE_WITH_CUDA void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { slot_mf_dim_vector_ = slot_mf_dim_vector; assert(slot_mf_dim_vector_.size() == slot_vector_.size()); - for (size_t i = 0; i < slot_mf_dim_vector.size(); i++) { + } + + void InitSlotInfo() { + if (slot_info_initialized_) { + return; + } + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto slots_vec = dataset->GetSlots(); + slot_offset_vector_.clear(); + for (auto& slot : slot_vector_) { + for (size_t i = 0; i < slots_vec.size(); ++i) { + if (std::to_string(slot) == slots_vec[i]) { + slot_offset_vector_.push_back(i); + break; + } + } + } + std::cout << "psgpu wrapper use slots: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + for (size_t i = 0; i < slot_mf_dim_vector_.size(); i++) { slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; } @@ -345,6 +395,7 @@ class 
PSGPUWrapper { TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); grad_type_size_ = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + slot_info_initialized_ = true; } #endif @@ -385,9 +436,16 @@ class PSGPUWrapper { int max_mf_dim_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; + int gpu_graph_mode_ = 1; #ifdef PADDLE_WITH_CUDA std::vector inner_comms_; std::vector inter_comms_; @@ -405,6 +463,7 @@ class PSGPUWrapper { int year_; int month_; int day_; + bool slot_info_initialized_ = false; int use_afs_api_ = 0; #ifdef PADDLE_WITH_CUDA @@ -428,6 +487,7 @@ class PSGPUWrapper { std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; bool running_ = false; + std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; protected: diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cb33e87f490c25..cee122e540f7e1 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -118,6 +118,11 @@ void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) { void HogwildWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif device_reader_->Start(); std::vector op_total_time; std::vector op_name; @@ -174,8 +179,6 @@ void HogwildWorker::TrainFilesWithProfiler() { PrintFetchVars(); #ifdef PADDLE_WITH_HETERPS dev_ctx_->Wait(); - VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time - << " seconds, ins_num: " << total_inst; for (size_t i = 0; i < op_name.size(); ++i) { VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] << ", mean time: " << op_total_time[i] / total_inst @@ -197,6 +200,9 @@ void HogwildWorker::TrainFilesWithProfiler() { thread_scope_->DropKids(); timeline.Start(); } + VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst << " read time: " << read_time + << "seconds "; if (need_dump_field_ || need_dump_param_) { writer_.Flush(); @@ -213,12 +219,21 @@ void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); platform::Timer timeline; timeline.Start(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; int batch_cnt = 0; + +#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA) + platform::SetDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { for (auto &op : ops_) { bool need_skip = false; @@ -244,9 +259,12 @@ void HogwildWorker::TrainFiles() { ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } timeline.Pause(); - VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; if (need_dump_field_ || need_dump_param_) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 
7a83fdccc218c4..6479f7ae726548 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, } } #endif + for (auto& var : main_program.Block(0).AllVars()) { + if (var->Persistable()) { + auto it = std::find(need_merge_var_names_.begin(), + need_merge_var_names_.end(), var->Name()); + if (it == need_merge_var_names_.end() && + var->GetType() != proto::VarType::SELECTED_ROWS) { + VLOG(2) << "train param: " << var->Name(); + trainable_param_.push_back(var->Name()); + } + } + } } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -192,18 +203,30 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { -#ifdef PADDLE_WTIH_PSCORE +#ifdef PADDLE_WITH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); - auto& recv_ctx = communicator->GetRecvCtxMap(); - Scope* thread_scope = workers_[0]->GetThreadScope(); - for (auto& iter : recv_ctx) { - auto& varnames = iter.second; - for (auto& name : varnames) { + auto thread_scope = workers_[0]->GetThreadScope(); + if (communicator == nullptr) { + for (auto& name : trainable_param_) { + VLOG(2) << "merge var " << name << " to root scope"; Variable* root_var = root_scope_->FindVar(name); LoDTensor* root_tensor = root_var->GetMutable(); Variable* var = thread_scope->FindVar(name); LoDTensor* tensor = var->GetMutable(); - TensorCopy((*tensor), root_tensor->place(), root_tensor); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } + } else { + auto& recv_ctx = communicator->GetRecvCtxMap(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + VLOG(2) << "merge var " << name << " to root scope"; + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } } } #endif @@ -236,11 +259,7 @@ void MultiTrainer::Finalize() { } LoDTensor* root_tensor = root_var->GetMutable(); -#ifdef PADDLE_WITH_HETERPS - for (size_t j = 0; j < places_.size(); j++) { -#else for (int j = 1; j < thread_num_; j++) { -#endif Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Variable* thread_var = cur_thread_scope->FindVar(need_merge_var_names_[i]); diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 6046000739976c..92643a254f8efb 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -27,8 +27,8 @@ cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enfo if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") add_custom_target( download_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program - COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program + COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program --no-check-certificate + COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program --no-check-certificate ) # all operators used in the program diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index ad1ddbfabd0911..b7674e06b9f73d 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ 
b/paddle/fluid/framework/ps_gpu_worker.cc @@ -128,16 +128,16 @@ void PSGPUWorker::TrainFiles() { timeline.Start(); int total_ins_num = 0; - - // how to accumulate fetched values here - device_reader_->Start(); - int cur_batch; - int batch_cnt = 0; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); #endif + + // how to accumulate fetched values here + device_reader_->Start(); + int cur_batch; + int batch_cnt = 0; while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index b86b4fec8a5718..c78f7611b63bee 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase { std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; + std::vector trainable_param_; #ifdef PADDLE_WITH_HETERPS std::vector places_; #endif diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h index e8525f440fe7f2..44eacc6a70554a 100644 --- a/paddle/fluid/inference/api/paddle_infer_declare.h +++ b/paddle/fluid/inference/api/paddle_infer_declare.h @@ -23,5 +23,7 @@ #endif // PADDLE_DLL_INFERENCE #endif // PD_INFER_DECL #else +#ifndef PD_INFER_DECL #define PD_INFER_DECL __attribute__((visibility("default"))) +#endif // PD_INFER_DECL #endif // _WIN32 diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc new file mode 100644 index 00000000000000..c8ab269c023a5b --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +constexpr int64_t kNoPadding = -1; + +template +class LookupTableV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); // int tensor + auto *output_t = ctx.Output("Out"); // float tensor + auto *table_t = ctx.Input("W"); + + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ( + table_var->IsType(), true, + platform::errors::InvalidArgument("mlu only accept LoDTensor")); + output_t->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc table_desc(*table_t); + MLUCnnlTensorDesc output_desc(*output_t); + + int64_t padding_idx = ctx.Attr("padding_idx"); + if (padding_idx == kNoPadding) { + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + table_desc.get(), GetBasePtr(table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } else { + Tensor tmp_table_t(table_t->type()); + tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); + + Tensor index; + index.mutable_data({1, 1}, ctx.GetPlace()); + auto idx_value = static_cast(padding_idx); + MLUCnnlTensorDesc index_desc(index); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &idx_value, index_desc.get(), + GetBasePtr(&index)); + + auto update_dim = phi::make_ddim({1, table_t->dims()[1]}); + Tensor update; + update.mutable_data(update_dim, ctx.GetPlace()); + + auto update_value = static_cast(0); + MLUCnnlTensorDesc update_desc(update); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &update_value, + update_desc.get(), GetBasePtr(&update)); + + MLUCnnlTensorDesc tmp_table_desc(tmp_table_t); + MLUCnnl::ScatterNd( + ctx, CNNL_SCATTERND_UPDATE, index_desc.get(), GetBasePtr(&index), + update_desc.get(), GetBasePtr(&update), table_desc.get(), + GetBasePtr(table_t), tmp_table_desc.get(), GetBasePtr(&tmp_table_t)); + + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + tmp_table_desc.get(), GetBasePtr(&tmp_table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } + } +}; + +template +class LookupTableV2GradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = + ctx.Input(framework::GradVarName("Out")); + auto *table_grad_t = + ctx.Output(framework::GradVarName("W")); + table_grad_t->mutable_data(ctx.GetPlace()); + + int padding_idx = static_cast(ctx.Attr("padding_idx")); + + Tensor ids_int32(ids_t->dtype()); + if (ids_t->dtype() != DataType::INT32) { + ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32); + MLUCnnl::Cast(ctx, cast_type, ids_desc.get(), GetBasePtr(ids_t), + ids_int32_desc.get(), GetBasePtr(&ids_int32)); + } else { + ids_int32 = *ids_t; + } + + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + MLUCnnlTensorDesc output_grad_desc(*output_grad_t); + MLUCnnlTensorDesc table_grad_desc(*table_grad_t); + + MLUCnnl::EmbeddingBackward(ctx, padding_idx, false, ids_int32_desc.get(), + GetBasePtr(&ids_int32), output_grad_desc.get(), + GetBasePtr(output_grad_t), table_grad_desc.get(), + GetBasePtr(table_grad_t)); + } +}; +} // namespace operators +} // namespace 
paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(lookup_table_v2, ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel); + +REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index ecee094de346e6..393247644c2e88 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -44,14 +44,6 @@ class MKLDNNActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor")); - Functor functor; functor(ctx); } @@ -62,14 +54,6 @@ class MKLDNNActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Input OutGrad tensor")); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Input OutGrad tensor")); - Functor functor; functor(ctx); } diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index b10572edf6f273..747e4603d7fe77 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -36,100 +36,58 @@ template class DeQuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE(scale_data, 0.0f, - platform::errors::InvalidArgument( - "Dequantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Dequantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Dequantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_shift = quantization_shift != 0.0f; + auto* out = ctx.Output("Output"); + + PADDLE_ENFORCE(quantization_scale != 0.0f, + platform::errors::InvalidArgument( + "Dequantization scale must be different than 0.0f")); + + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Dequantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - - const T* input_data = input->data(); - float* output_data = output->mutable_data(ctx.GetPlace()); - - float reorder_shift = 
-scale_shift / scale_data; - - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); - dnnl::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); - MKLDNNMemoryFormat src_fmt = input->format(); - - std::string key = - platform::CreateKey(dev_ctx, src_dt, src_tz, ctx.OutputName("Output")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_prim = key + "@r"; - const std::string key_src_mem = key + "@s"; - const std::string key_dst_mem = key + "@d"; - - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); - - if (reorder_p == nullptr) { - dnnl::primitive_attr attri; - int mask = 0; - float reorder_scale = 1. / scale_data; - attri.set_output_scales(mask, {reorder_scale}); - - if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - std::fill(output_data, output_data + output->numel(), reorder_shift); - } - - auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); - - auto dst_md = - platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, - platform::MKLDNNFormatForSize( - dst_tz.size(), MKLDNNMemoryFormat::nchw)); - - dst_memory = std::make_shared( - dst_md, engine, to_void_cast(output_data)); - - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); - dev_ctx.SetBlob(key_prim, reorder_p); - dev_ctx.SetBlob(key_src_mem, src_memory); - dev_ctx.SetBlob(key_dst_mem, dst_memory); - } else { - src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(to_void_cast(input_data)); - - dst_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - if (with_shift) - std::fill(output_data, output_data + output->numel(), reorder_shift); - dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); + + auto x_tz = phi::vectorize(x->dims()); + auto x_paddle_dtype = framework::TransToProtoVarType(x->dtype()); + auto out_paddle_dtype = framework::TransToProtoVarType(out->dtype()); + + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; // same shift and scale for whole tensor + + const float reorder_scale = 1. 
/ quantization_scale; + attrs.set_output_scales(mask, {reorder_scale}); + + if (with_shift) { + attrs.set_zero_points(DNNL_ARG_SRC, mask, + {static_cast(quantization_shift)}); } + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 4cae3f0c737115..8cbe46bee481ab 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "dnnl.hpp" +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/quantize_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -34,83 +35,73 @@ template class QuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE( - scale_data, 0.0f, - platform::errors::InvalidArgument("Quantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Quantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Quantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Output"); + + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_scale = quantization_scale != 1.0f; + const bool with_shift = quantization_shift != 0.0f; + + PADDLE_ENFORCE_NE(quantization_scale, 0.0f, + platform::errors::InvalidArgument( + "Quantization scale must be different than 0.0f")); + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Quantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - std::vector pipeline; - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); + auto x_tz = phi::vectorize(x->dims()); - const T* input_data = input->data(); + const bool is_negative_input = ctx.Attr("is_negative_input"); + const bool bfloat16 = ctx.Attr("bfloat16"); - bool is_negative_input = ctx.Attr("is_negative_input"); - 
bool bfloat16 = ctx.Attr("bfloat16"); + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; - // TODO(jczaja): Refactor with Acquire API - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - - std::string out_layout = ctx.Attr("output_format"); - MKLDNNMemoryFormat out_format = - platform::data_format_to_memory_format(out_layout); - dnnl::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); + if (with_scale) { + attrs.set_output_scales(mask, {quantization_scale}); + } if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - // memset casts scale_shift to unsigned char (uint8_t) internally - std::memset(output_data, scale_shift, output->numel()); + attrs.set_zero_points(DNNL_ARG_DST, mask, + {static_cast(quantization_shift)}); } - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); + framework::proto::VarType::Type x_paddle_dtype = + framework::TransToProtoVarType(x->dtype()); + framework::proto::VarType::Type out_paddle_dtype; - std::shared_ptr dst_md; if (bfloat16) { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::BF16; } else if (is_negative_input && !with_shift) { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::INT8; } else { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::UINT8; } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); + + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 867c5f212ba6c1..9d3b8e2407fbfb 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -34,6 +34,12 @@ cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, return cast_type; } +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type) { + return GetCastDataType(framework::TransToProtoVarType(src_type), + framework::TransToProtoVarType(dst_type)); +} + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type) { for 
(auto it = MLU_SUPPORTED_CAST_TYPE.begin(); it != MLU_SUPPORTED_CAST_TYPE.end(); ++it) { @@ -2713,17 +2719,16 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } -/* static */ void MLUCnnl::ScatterNd(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t indices_desc, - const void* indices, - const cnnlTensorDescriptor_t updates_desc, - const void* updates, - const cnnlTensorDescriptor_t output_desc, - void* output) { +/* static */ void MLUCnnl::ScatterNd( + const ExecutionContext& ctx, cnnlScatterNdMode_t mode, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices, - updates_desc, updates, output_desc, - output)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlScatterNd_v2(handle, mode, indices_desc, indices, updates_desc, + updates, input_desc, input, output_desc, output)); } /* static */ void MLUCnnl::BitWise( @@ -2777,5 +2782,26 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlReciprocal(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::EmbeddingBackward( + const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetEmbeddingBackwardWorkspaceSize( + handle, diff_desc, output_desc, scale_grad_by_freq, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlEmbeddingBackward( + handle, padding_idx, scale_grad_by_freq, indices_desc, indices, diff_desc, + diff, workspace_ptr, workspace_size, output_desc, output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 24db6c760d78ab..f048ac7c5c3be0 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -175,6 +175,10 @@ const std::map, cnnlCastDataType_t> cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, const VT::Type& dst_type); + +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type); + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type); cnnlDeviceType_t GetCnnlDev(int dev_ordinal); @@ -1202,11 +1206,13 @@ class MLUCnnl { const void* k, const int k_int, const cnnlTensorDescriptor_t output_desc, void* output); - static void ScatterNd(const ExecutionContext& ctx, + static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output); static void BitWise(const ExecutionContext& ctx, @@ -1227,6 +1233,12 @@ class MLUCnnl { const void* input, const cnnlTensorDescriptor_t output_desc, void* output); + + static void EmbeddingBackward( + const 
ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output); }; template diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index f721608cffb082..abfdb62ec34ac3 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -26,6 +26,7 @@ template static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto outputs = ctx.MultiOutput("Out"); + auto embedding_size_vec = ctx.Attr>("size"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); // GpuPSPS only supports float now @@ -44,7 +45,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { #ifdef PADDLE_WITH_HETERPS auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance(); gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths, - 0); + embedding_size_vec, 0); #endif } diff --git a/paddle/fluid/operators/unstack_op_mlu.cc b/paddle/fluid/operators/unstack_op_mlu.cc new file mode 100644 index 00000000000000..9c4dd256a94efe --- /dev/null +++ b/paddle/fluid/operators/unstack_op_mlu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class UnStackMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto out = ctx.MultiOutput("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += x->dims().size(); + int num = x->dims()[axis]; + + std::vector out_descs; + std::vector out_raw_descs; + std::vector out_ptrs; + std::vector new_dims = phi::vectorize(x->dims()); + new_dims[axis] = 1; + for (int i = 0; i < num; i++) { + out[i]->mutable_data(ctx.GetPlace()); + out_descs.emplace_back(MLUCnnlTensorDesc(new_dims.size(), new_dims.data(), + ToCnnlDataType())); + out_raw_descs.push_back(out_descs.back().get()); + out_ptrs.push_back(GetBasePtr(out[i])); + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnl::Split(ctx, num, axis, x_desc.get(), GetBasePtr(x), + out_raw_descs.data(), out_ptrs.data()); + } +}; + +template +class UnStackGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); + + std::vector x_descs; + std::vector x_raw_descs; + std::vector x_ptrs; + for (int i = 0; i < num; i++) { + if (x[i]->dims().size() != 0) { + std::vector in_dims = phi::vectorize(x[i]->dims()); + in_dims.insert(in_dims.begin() + axis, 1); + x_descs.emplace_back(MLUCnnlTensorDesc(in_dims.size(), in_dims.data(), + ToCnnlDataType())); + } else { + int input_dims = 1; + x_descs.emplace_back( + MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType())); + } + x_raw_descs.push_back(x_descs.back().get()); + x_ptrs.push_back(GetBasePtr(x[i])); + } + y->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc y_desc(*y); + MLUCnnl::Concat(ctx, num, axis, x_raw_descs.data(), x_ptrs.data(), + y_desc.get(), GetBasePtr(y)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(unstack, ops::UnStackMLUKernel, + ops::UnStackMLUKernel); + +REGISTER_OP_MLU_KERNEL(unstack_grad, ops::UnStackGradMLUKernel, + ops::UnStackGradMLUKernel); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 12fa933701ef46..13b5005a30fa05 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1057,6 +1057,14 @@ class ReorderMKLDNNHandler { return std::make_shared(*(src_memory_p), *(dst_memory_p)); } + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p, + const dnnl::primitive_attr& attrs) { + return std::make_shared(*(src_memory_p), *(dst_memory_p), + attrs); + } + private: std::vector dims_; framework::proto::VarType::Type vtype_, vtype_dst_; diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 5e2274cb651385..5aac6ada05b18a 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -298,6 +298,8 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_preload_thread_num", &framework::Dataset::SetPreLoadThreadNum, py::call_guard()) + .def("set_graph_device_keys", &framework::Dataset::SetGraphDeviceKeys, + py::call_guard()) 
.def("create_preload_readers", &framework::Dataset::CreatePreLoadReaders, py::call_guard()) .def("destroy_preload_readers", diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4ffb513671c565..e2f4feebf9e3a5 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -178,13 +178,13 @@ void BindHeterClient(py::module* m) { void BindGraphNode(py::module* m) { py::class_(*m, "GraphNode") .def(py::init<>()) - .def("get_id", &GraphNode::get_id) + .def("get_id", &GraphNode::get_py_id) .def("get_feature", &GraphNode::get_feature); } void BindGraphPyFeatureNode(py::module* m) { py::class_(*m, "FeatureNode") .def(py::init<>()) - .def("get_id", &GraphNode::get_id) + .def("get_id", &GraphNode::get_py_id) .def("get_feature", &GraphNode::get_feature); } @@ -336,17 +336,27 @@ void BindGraphGpuWrapper(py::module* m) { *m, "GraphGpuWrapper") .def(py::init([]() { return GraphGpuWrapper::GetInstance(); })) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) - .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) + .def("graph_neighbor_sample", + py::overload_cast( + &GraphGpuWrapper::graph_neighbor_sample)) + .def("graph_neighbor_sample", + py::overload_cast&, int>( + &GraphGpuWrapper::graph_neighbor_sample)) .def("set_device", &GraphGpuWrapper::set_device) + .def("set_feature_separator", &GraphGpuWrapper::set_feature_separator) .def("init_service", &GraphGpuWrapper::init_service) .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) .def("load_edge_file", &GraphGpuWrapper::load_edge_file) - .def("upload_batch", &GraphGpuWrapper::upload_batch) - .def("get_all_id", &GraphGpuWrapper::get_all_id) - .def("init_sample_status", &GraphGpuWrapper::init_sample_status) - .def("free_sample_status", &GraphGpuWrapper::free_sample_status) + .def("upload_batch", + py::overload_cast>&>( + &GraphGpuWrapper::upload_batch)) + .def("upload_batch", + py::overload_cast>&, int>( + &GraphGpuWrapper::upload_batch)) + .def("get_all_id", py::overload_cast(&GraphGpuWrapper::get_all_id)) + .def("get_all_id", py::overload_cast(&GraphGpuWrapper::get_all_id)) .def("load_next_partition", &GraphGpuWrapper::load_next_partition) .def("make_partitions", &GraphGpuWrapper::make_partitions) .def("make_complementary_graph", diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 76e617c7dafcf3..6112a9a1f45b6b 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -375,12 +375,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if attrs['use_ps_gpu']: _program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -614,15 +614,24 @@ def _check_conflict(self, other_pass): return True def _add_push_box_sparse_op(self, program): + insert_index = -1 + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + insert_index = idx for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( 
op.desc, cpt.to_text(set()), []) for op_desc in grad_op_desc: - new_op_desc = program.global_block().desc.append_op() + new_op_desc = program.global_block().desc._insert_op( + insert_index + 1) new_op_desc.copy_from(op_desc) new_op_desc._set_attr(op_role_attr_name, backward) + new_op = paddle.fluid.framework.Operator(program.global_block(), + new_op_desc) + program.global_block().ops.insert(insert_index + 1, new_op) + program.global_block()._sync_with_cpp() def _remove_optimizer_var(self, program): embedding_w = {} @@ -670,7 +679,7 @@ def _remove_lookup_table_grad_op_and_var(self, program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6df7559a22e81..888d517116a15f 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1013,12 +1013,13 @@ def sync_strategy_envs(): if self.context['ps_mode'] == DistributedMode.GEO: self._communicator.init_params(init_params) else: - if role_id == 0: - self._init_all_params(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + if role_id == 0: + self._init_all_params(scopes, send_ctx, dense_map) fleet.util.barrier() - - self._pull_all_dense(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + self._pull_all_dense(scopes, send_ctx, dense_map) fleet.util.barrier() if self.context['ps_mode'] == DistributedMode.GEO: diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index c73ea8b5b0e1a6..55b44309ff71a3 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -901,7 +901,7 @@ def shuffle_batch(x, seed=None): seed = helper.create_variable( name=unique_name.generate("shuffle_batch_seed"), dtype="int64", - persistable=True) + persistable=False) helper.append_op( type='shuffle_batch', inputs={'X': x, diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 84064669c0dc67..70c7c0fb8c4382 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -1042,6 +1042,27 @@ def _set_heter_ps(self, enable_heter_ps=False): """ self.dataset.set_heter_ps(enable_heter_ps) + def set_graph_device_keys(self, device_keys): + """ + """ + self.dataset.set_graph_device_keys(device_keys) + + def set_graph_config(self, config): + """ + """ + self.proto_desc.graph_config.walk_degree = config.get("walk_degree", 1) + self.proto_desc.graph_config.walk_len = config.get("walk_len", 20) + self.proto_desc.graph_config.window = config.get("window", 5) + self.proto_desc.graph_config.once_sample_startid_len = config.get( + "once_sample_startid_len", 8000) + self.proto_desc.graph_config.sample_times_one_chunk = config.get( + "sample_times_one_chunk", 10) + self.proto_desc.graph_config.batch_size = config.get("batch_size", 1) + self.proto_desc.graph_config.debug_mode = config.get("debug_mode", 0) + self.proto_desc.graph_config.first_node_type = config.get( + "first_node_type", "") + self.proto_desc.graph_config.meta_path = config.get("meta_path", "") + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 
2c09abac9e7ba8..51e89cc301cf30 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -293,12 +293,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if use_ps_gpu: program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -576,7 +576,7 @@ def _add_push_box_sparse_op(program): op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) @@ -599,7 +599,7 @@ def _remove_lookup_table_grad_op_and_var(program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 40ff41fe89f47f..dd9d7e760a8e5e 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -103,9 +103,9 @@ def init_worker(self): # prepare for client to client communication if self._role_maker.is_worker(): info = self._fleet_ptr.get_clients_info() - print("IIIIFO: {}".format(info)) + print("Client Info: {}".format(info)) all_info = self._role_maker._worker_gather(info[0]) - print("ALL info: {}".format(all_info)) + print("All Client Info: {}".format(all_info)) self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.set_client2client_config( self._client2client_request_timeout_ms, diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 8dfe9c32cd9734..5f0af296441fff 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -124,14 +124,15 @@ def add_sparse_table(self, table_id, strategy): support_accessor_class = [ 'DownpourFeatureValueAccessor', 'DownpourCtrAccessor', - 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor', - 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor' + 'DownpourCtrDymfAccessor', 'DownpourSparseValueAccessor', + 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor' ] if strategy.get('sparse_accessor_class') is not None: accessor_class = strategy.get('sparse_accessor_class') if accessor_class not in support_accessor_class: raise ValueError( - "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', \ + "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDymfAccessor', \ 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor'], \ but actual %s" % (accessor_class)) else: @@ -141,6 +142,7 @@ def add_sparse_table(self, table_id, strategy): if accessor_class == 'DownpourFeatureValueAccessor' \ 
or accessor_class == 'DownpourCtrAccessor' \ + or accessor_class == 'DownpourCtrDymfAccessor' \ or accessor_class == 'DownpourCtrDoubleAccessor': table.accessor.sparse_sgd_param.learning_rate = strategy.get( 'sparse_learning_rate', 0.05) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 5d7dacc007e6b7..9483556d46f59c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -339,6 +339,7 @@ def _check_config_fleet_with_program_op(self, strategy, table_name, # set sparse_embedx_dim in the strategy according to accessor and use_cvm config if accessor == "DownpourFeatureValueAccessor" \ or accessor == "DownpourCtrAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": if st.get("sparse_embedx_dim") is not None \ @@ -586,6 +587,7 @@ def _minimize(self, # set sparse_embedx_dim in strategy, # user do not have to set it in config_fleet if accessor == "DownpourFeatureValueAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourCtrAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": @@ -873,7 +875,8 @@ def _minimize(self, if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", - "DownpourUnitAccessor", "DownpourDoubleUnitAccessor" + "DownpourUnitAccessor", "DownpourDoubleUnitAccessor", + "DownpourCtrDymfAccessor" ]: opt_info["dump_slot"] = True elif server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 799d93918f2efd..97506ead5fad4d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -737,7 +737,7 @@ def _pull_gpups_sparse(input, for i in range(len(inputs)) ] w = helper.create_parameter( - attr=helper.param_attr, shape=[11], dtype=dtype, is_bias=False) + attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False) helper.append_op( type='pull_gpups_sparse', inputs={'Ids': inputs, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index a0836c959c84b9..fae52ab833b9d4 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle class TestDeQuantizeOp(OpTest): @@ -110,19 +111,6 @@ def set_data_type(self): self.data_type = 'uint16' -class TestDeQuantizeOp_ZeroScale(TestDeQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization scale cannot be 0.0') - - # 2-dim input # P - positive input, with shift class TestDeQuantizeOpShift_2_P(TestDeQuantizeOp): @@ -177,28 +165,6 @@ def set_input_size(self): self.input_size = [2, 3, 4, 5] -class TestDeQuantizeOp_NegativeShift(TestDeQuantizeOp): - def set_shift(self): - 
self.shift = -10.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization shift must be nonnegative.') - - -class TestDeQuantizeOp_TooBigShift(TestDeQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Dequantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py index a7acc5f3f9bf32..c92d870565fbc9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest +import paddle class TestQuantizeOp(OpTest): @@ -104,19 +105,6 @@ def set_is_negative(self): self.is_nagative = False -class TestQuantizeOp_ZeroScale(TestQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization scale cannot be 0.0') - - # 2-dim input # P - positive input class TestQuantizeOpShift_NCHW_2_P(TestQuantizeOp): @@ -201,34 +189,6 @@ def set_output_format(self): self.output_format = 'NHWC' -class TestQuantizeOp_NegativeShift(TestQuantizeOp): - def set_is_negative(self): - self.is_nagative = False - - def set_scale(self): - self.scale = 100.0 - - def set_shift(self): - self.shift = -10.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization shift must be nonnegative.') - - -class TestQuantizeOp_TooBigShift(TestQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Quantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py new file mode 100644 index 00000000000000..f9a08ba4c9b146 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +class TestLookupTableV2(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "lookup_table_v2" + + self.init_dtype() + self.init_dims() + self.init_padding_idx() + np.random.seed(SEED) + w = np.random.random([self.vocab, self.dim]).astype(self.dtype) + x = np.random.randint( + 0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype) + out = w[x] + if self.padding_idx != -1: + out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim) + + self.inputs = { + 'W': OpTest.np_dtype_to_fluid_dtype(w), + 'Ids': OpTest.np_dtype_to_fluid_dtype(x) + } + self.attrs = { + 'is_sparse': False, + 'is_distributed': False, + 'remote_prefetch': False, + 'padding_idx': self.padding_idx + } + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int32 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is not multiple of 32 + self.dim = 20 + + def init_padding_idx(self): + self.padding_idx = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') + + +class TestLookupTableV2FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2Dim32(TestLookupTableV2): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is multiple of 32 + self.dim = 64 + + +class TestLookupTableV2Dim32FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int64 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + self.dim = 64 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2WithPadding(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + +class TestLookupTableV2WithPadding1(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int64 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py new file mode 100644 index 00000000000000..a75a6aa1dfcb92 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import unittest +import paddle + +paddle.enable_static() + + +class TestUnStackOpBase(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append('y{}'.format(i)) + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.set_mlu() + self.init_dtype() + + self.x = np.random.random(size=self.input_dim).astype(self.dtype) + + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + for i in range(self.input_dim[self.axis]): + tmp.append((y_names[i], np.reshape(outs[i], new_shape))) + + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], self.get_y_names()) + + +class TestStackOp3(TestUnStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestUnStackOpBase): + def initParameters(self): + self.axis = -3 + + +class TestStackOp5(TestUnStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestUnStackOpBase): + def initParameters(self): + self.axis = 2 + + +if __name__ == '__main__': + unittest.main()
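A minimal usage sketch of the graph-dataset knobs added in the python/paddle/fluid/dataset.py hunk above. The config keys and their defaults are taken from that hunk; the InMemoryDataset class name and the DatasetFactory entry point are assumptions inferred from context (the hunk shows only the method bodies), and the first_node_type / meta_path values are hypothetical placeholders.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    # Assumed entry point: the new set_graph_config method is added to the
    # dataset class that precedes QueueDataset in fluid/dataset.py, which is
    # taken here to be InMemoryDataset.
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")

    # Each key mirrors a graph_config field set in the diff; omitted keys fall
    # back to the defaults shown there (walk_degree=1, walk_len=20, window=5,
    # once_sample_startid_len=8000, sample_times_one_chunk=10, batch_size=1,
    # debug_mode=0).
    dataset.set_graph_config({
        "walk_degree": 1,
        "walk_len": 20,
        "window": 5,
        "once_sample_startid_len": 8000,
        "sample_times_one_chunk": 10,
        "batch_size": 1,
        "debug_mode": 0,
        "first_node_type": "user",   # hypothetical node type
        "meta_path": "user2item",    # hypothetical meta path
    })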