diff --git a/AUTHORS.md b/AUTHORS.md index 71d028fac3691..59f6a8ebb5f1c 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,6 +1,9 @@ | Github account | name | |---|---| | abhinavarora | Abhinav Arora | +| andreazanetti | Andrea Zanetti | +| arlesniak | Artur Lesniak | +| arogowie-intel | Adam Osewski | | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | @@ -8,6 +11,7 @@ | chengxiaohua1105 | Xiao-Hua Cheng | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxysteven | Xing-Yi Cheng | +| ddokupil | Dariusz Dokupil | | dzhwinter | Zhi-Hong Dong | | dragonwarrior | Long Wang | | dyning | Yuning Du | @@ -21,6 +25,7 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | +| jakpiase | Jakub Piasecki | | [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | @@ -42,6 +47,7 @@ | pakchoi | Chuan-Jiang Song | | panyx0718 | Xin Pan | | pengli09 | Peng Li | +| pmajchrzak |Piotr Majchrzak | | pkuyym | Ya-Ming Yang | | pzelazko-intel | Pawel Zelazko | | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz | diff --git a/README.md b/README.md index 8b437e4115abe..d0a35332d474e 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide ## Communication - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. -- QQ discussion group: 778260830 (PaddlePaddle). +- QQ discussion group: 793866180 (PaddlePaddle). - [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. ## Copyright and License diff --git a/README_cn.md b/README_cn.md index 7a10cba284549..2be8be3df6e7b 100644 --- a/README_cn.md +++ b/README_cn.md @@ -83,7 +83,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 ## 交流与反馈 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 -- QQ群: 778260830 (PaddlePaddle) +- QQ群: 793866180 (PaddlePaddle) - [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 ## 版权和许可证 diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fb1d4d9d56dcc..4e0768fc10f03 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,8 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) +SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa) + # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 100b915339469..c591a9391dfa5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -78,6 +78,21 @@ if(WITH_ASCEND OR WITH_ASCEND_CL) -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) else() + if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS_DEBUG $) + else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + endif() ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -90,12 +105,12 @@ else() BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_ROCM=${WITH_ROCM} diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index f846623602ed7..a03ff7d22dcad 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a5c74a46631e9..53dcde616b261 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -391,7 +391,7 @@ function(cc_binary TARGET_NAME) endfunction(cc_binary) function(cc_test_build TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -409,14 +409,12 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() - - check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) - endfunction() function(cc_test_run TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs COMMAND ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) diff --git a/cmake/init.cmake b/cmake/init.cmake index b11156d2e9986..4bdcaeb4c5f3c 100644 --- 
a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,6 +18,16 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() + set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG") + set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG") + + set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG") + set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG") + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 75b1100caa915..33390745cc8c9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -44,6 +44,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (WITH_NV_JETSON) + list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f90fa3509d63d..56edaff2a50da 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -261,6 +261,14 @@ if(WITH_PSLIB) if(WITH_PSLIB_BRPC) include(external/pslib_brpc) # download, build, install pslib_brpc list(APPEND third_party_deps extern_pslib_brpc) + else() + include(external/snappy) + list(APPEND third_party_deps extern_snappy) + + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + include(external/brpc) + list(APPEND third_party_deps extern_brpc) endif() endif(WITH_PSLIB) diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index eafb4d596cc16..70f2da6d7252c 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -80,11 +80,11 @@ std::future GraphBrpcClient::get_node_feat( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; + size_t fail_num = 0; for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { - if (closure->check_response(request_idx, - PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + if (closure->check_response(request_idx, PS_GRAPH_GET_NODE_FEAT) != + 0) { ++fail_num; } else { auto &res_io_buffer = @@ -144,6 +144,163 @@ std::future GraphBrpcClient::get_node_feat( return fut; } + +std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_CLEAR) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 
0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index)->set_cmd_id(PS_GRAPH_CLEAR); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} +std::future GraphBrpcClient::add_graph_node( + uint32_t table_id, std::vector &node_id_list, + std::vector &is_weighted_list) { + std::vector> request_bucket; + std::vector> is_weighted_bucket; + bool add_weight = is_weighted_list.size() > 0; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + if (add_weight) is_weighted_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + if (add_weight) + is_weighted_bucket[index_mapping[server_index]].push_back( + query_idx < is_weighted_list.size() ? is_weighted_list[query_idx] + : false); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_ADD_GRAPH_NODE) != + 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_ADD_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + if (add_weight) { + bool weighted[is_weighted_bucket[request_idx].size() + 1]; + for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) + weighted[j] = is_weighted_bucket[request_idx][j]; + closure->request(request_idx) + ->add_params((char *)weighted, + sizeof(bool) * is_weighted_bucket[request_idx].size()); + } + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} +std::future GraphBrpcClient::remove_graph_node( + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_REMOVE_GRAPH_NODE) != 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared<std::promise<int32_t>>(); + closure->add_promise(promise); + std::future<int32_t> fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_REMOVE_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} // char* &buffer,int &actual_size std::future<int32_t> GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size, @@ -174,8 +331,8 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighboors( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; - for (int request_idx = 0; request_idx < request_call_num; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { @@ -254,13 +411,14 @@ std::future<int32_t> GraphBrpcClient::random_sample_nodes( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { ids.push_back(*(uint64_t *)(buffer + index)); index += GraphNode::id_size; } + delete[] buffer; } closure->set_promise_value(ret); }); @@ -292,7 +450,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { @@ -301,6 +459,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list( index += node.get_size(false); res.push_back(node); } + delete[] buffer; } closure->set_promise_value(ret); }); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 4e6775a4bedaf..5696e8b08037b 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -78,6 +78,13 @@ class GraphBrpcClient : public BrpcPsClient { const uint32_t& table_id, const std::vector<uint64_t>& node_ids, const std::vector<std::string>& feature_names, std::vector<std::vector<std::string>>& res); + + virtual std::future<int32_t> clear_nodes(uint32_t table_id); + virtual std::future<int32_t> add_graph_node( + uint32_t table_id, std::vector<uint64_t>& node_id_list, + std::vector<bool>& is_weighted_list); + virtual std::future<int32_t> remove_graph_node( + uint32_t table_id, std::vector<uint64_t>& node_id_list); virtual int32_t initialize(); int get_shard_num() { return 
shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index bdd926278b624..52ac8c5d688a4 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -24,6 +24,14 @@ namespace paddle { namespace distributed { +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + int32_t GraphBrpcServer::initialize() { auto &service_config = _config.downpour_server_param().service_param(); if (!service_config.has_service_class()) { @@ -71,6 +79,58 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcService::clear_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + ((GraphTable *)table)->clear_nodes(); + return 0; +} + +int32_t GraphBrpcService::add_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + std::vector is_weighted_list; + if (request.params_size() == 2) { + size_t weight_list_size = request.params(1).size() / sizeof(bool); + bool *is_weighted_buffer = (bool *)(request.params(1).c_str()); + is_weighted_list = std::vector(is_weighted_buffer, + is_weighted_buffer + weight_list_size); + } + + ((GraphTable *)table)->add_graph_node(node_ids, is_weighted_list); + return 0; +} +int32_t GraphBrpcService::remove_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 1 argument"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + ((GraphTable *)table)->remove_graph_node(node_ids); + return 0; +} int32_t GraphBrpcServer::port() { return _server.listen_address().port; } int32_t GraphBrpcService::initialize() { @@ -92,21 +152,17 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::graph_random_sample_nodes; _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = &GraphBrpcService::graph_get_node_feat; - + _service_handler_map[PS_GRAPH_CLEAR] = &GraphBrpcService::clear_nodes; + _service_handler_map[PS_GRAPH_ADD_GRAPH_NODE] = + &GraphBrpcService::add_graph_node; + _service_handler_map[PS_GRAPH_REMOVE_GRAPH_NODE] = + &GraphBrpcService::remove_graph_node; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); return 0; } -#define CHECK_TABLE_EXIST(table, request, response) \ - if (table == NULL) { \ - std::string err_msg("table not found with table_id:"); \ - err_msg.append(std::to_string(request.table_id())); \ - 
set_response_code(response, -1, err_msg.c_str()); \ - return -1; \ - } - int32_t GraphBrpcService::initialize_shard_info() { if (!_is_initialize_shard_info) { std::lock_guard guard(_initialize_shard_mutex); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 32c572f9e6c2b..47c370572826a 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -86,6 +86,13 @@ class GraphBrpcService : public PsBaseService { int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t clear_nodes(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t add_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t remove_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); int32_t barrier(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); int32_t load_one_table(Table *table, const PsRequestMessage &request, diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index 61e4e0cf7bb91..39befb1a112c8 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -44,6 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } +void add_graph_node(std::vector node_ids, + std::vector weight_list) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -247,6 +250,34 @@ void GraphPyClient::load_edge_file(std::string name, std::string filepath, } } +void GraphPyClient::clear_nodes(std::string name) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->clear_nodes(table_id); + status.wait(); + } +} + +void GraphPyClient::add_graph_node(std::string name, + std::vector& node_ids, + std::vector& weight_list) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->add_graph_node(table_id, node_ids, weight_list); + status.wait(); + } +} + +void GraphPyClient::remove_graph_node(std::string name, + std::vector& node_ids) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->remove_graph_node(table_id, node_ids); + status.wait(); + } +} + void GraphPyClient::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows std::string params = "n" + name; diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index c6657be96ba44..da027fbae3e6f 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -141,6 +141,10 @@ class GraphPyClient : public GraphPyService { void finalize_worker(); void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); + void clear_nodes(std::string name); + void add_graph_node(std::string name, std::vector& node_ids, + std::vector& weight_list); + 
void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index d908c26da9870..a4b811e950a3b 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -52,6 +52,9 @@ enum PsCmdID { PS_GRAPH_SAMPLE_NEIGHBOORS = 31; PS_GRAPH_SAMPLE_NODES = 32; PS_GRAPH_GET_NODE_FEAT = 33; + PS_GRAPH_CLEAR = 34; + PS_GRAPH_ADD_GRAPH_NODE = 35; + PS_GRAPH_REMOVE_GRAPH_NODE = 36; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 0dc99de1bfe82..92f8304a8bf62 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -35,6 +35,77 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } +int32_t GraphTable::add_graph_node(std::vector &id_list, + std::vector &is_weight_list) { + size_t node_size = id_list.size(); + std::vector>> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + continue; + } + batch[get_thread_pool_index(id_list[i])].push_back( + {id_list[i], i < is_weight_list.size() ? is_weight_list[i] : false}); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p.first % this->shard_num - this->shard_start; + this->shards[index].add_graph_node(p.first)->build_edges(p.second); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::remove_graph_node(std::vector &id_list) { + size_t node_size = id_list.size(); + std::vector> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) continue; + batch[get_thread_pool_index(id_list[i])].push_back(id_list[i]); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p % this->shard_num - this->shard_start; + this->shards[index].delete_node(p); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +void GraphShard::clear() { + for (size_t i = 0; i < bucket.size(); i++) { + delete bucket[i]; + } + bucket.clear(); + node_location.clear(); +} + +GraphShard::~GraphShard() { clear(); } +void GraphShard::delete_node(uint64_t id) { + auto iter = node_location.find(id); + if (iter == node_location.end()) return; + int pos = iter->second; + delete bucket[pos]; + if (pos != (int)bucket.size() - 1) { + bucket[pos] = bucket.back(); + node_location[bucket.back()->get_id()] = pos; + } + node_location.erase(id); + bucket.pop_back(); +} GraphNode *GraphShard::add_graph_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -79,11 +150,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( int start = 0, end, 
index = 0, total_size = 0; res.clear(); std::vector>> tasks; - // std::string temp = ""; - // for(int i = 0;i < shards.size();i++) - // temp+= std::to_string((int)shards[i].get_size()) + " "; - // VLOG(0)<<"range distribution "<enqueue( [this, first, second, i]() -> std::vector { return shards[i].get_ids_by_range(first, second); @@ -106,7 +172,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( } total_size += shards[i].get_size(); } - for (int i = 0; i < tasks.size(); i++) { + for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); for (auto &id : vec) { res.push_back(id); @@ -219,7 +285,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { for (auto &shard : shards) { auto bucket = shard.get_bucket(); - for (int i = 0; i < bucket.size(); i++) { + for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); } } @@ -238,10 +304,29 @@ Node *GraphTable::find_node(uint64_t id) { uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return node_id % shard_num % shard_num_per_table % task_pool_size_; } + +uint32_t GraphTable::get_thread_pool_index_by_shard_index( + uint64_t shard_index) { + return shard_index % shard_num_per_table % task_pool_size_; +} + +int32_t GraphTable::clear_nodes() { + std::vector> tasks; + for (size_t i = 0; i < shards.size(); i++) { + tasks.push_back( + _shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( + [this, i]() -> int { + this->shards[i].clear(); + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { - bool need_feature = false; int total_size = 0; for (int i = 0; i < shards.size(); i++) { total_size += shards[i].get_size(); @@ -281,7 +366,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } std::vector> first_half, second_half; int start_index = rand() % total_size; - for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + for (size_t i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) first_half.push_back({ranges_pos[i] + start_index, ranges_pos[i] + ranges_len[i] + start_index}); @@ -386,7 +471,6 @@ std::pair GraphTable::parse_feature( if (this->feat_id_map.count(fields[0])) { int32_t id = this->feat_id_map[fields[0]]; std::string dtype = this->feat_dtype[id]; - int32_t shape = this->feat_shape[id]; std::vector values(fields.begin() + 1, fields.end()); if (dtype == "feasign") { return std::make_pair( diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index b18da82abe61c..5eeb3915f5b1f 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -36,11 +36,12 @@ class GraphShard { size_t get_size(); GraphShard() {} GraphShard(int shard_num) { this->shard_num = shard_num; } + ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); std::vector get_ids_by_range(int start, int end) { std::vector res; - for (int i = start; i < end && i < bucket.size(); i++) { + for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; @@ -48,6 +49,8 @@ class GraphShard { GraphNode *add_graph_node(uint64_t id); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); + void 
delete_node(uint64_t id); + void clear(); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); std::unordered_map get_node_location() { return node_location; @@ -85,6 +88,11 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); + int32_t add_graph_node(std::vector &id_list, + std::vector &is_weight_list); + + int32_t remove_graph_node(std::vector &id_list); + Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -97,6 +105,7 @@ class GraphTable : public SparseTable { return 0; } + virtual int32_t clear_nodes(); virtual void clear() {} virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string ¶m) { return 0; } @@ -105,6 +114,7 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); virtual uint32_t get_thread_pool_index(uint64_t node_id); virtual std::pair parse_feature(std::string feat_str); @@ -128,4 +138,5 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; }; } // namespace distributed + }; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index 718fce9950719..a4f672c2963a8 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/table/common_sparse_table.h" - #include +#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -25,7 +25,8 @@ class ValueBlock; } // namespace distributed } // namespace paddle -#define PSERVER_SAVE_SUFFIX "_txt" +#define PSERVER_SAVE_SUFFIX ".shard" +using boost::lexical_cast; namespace paddle { namespace distributed { @@ -100,7 +101,7 @@ struct Meta { }; void ProcessALine(const std::vector& columns, const Meta& meta, - std::vector>* values) { + const int64_t id, std::vector>* values) { auto colunmn_size = columns.size(); auto load_values = paddle::string::split_string(columns[colunmn_size - 1], ","); @@ -116,8 +117,18 @@ void ProcessALine(const std::vector& columns, const Meta& meta, "The data format in txt does not meet the field " "requirements defined in meta")); - std::transform(start, end, std::back_inserter(val), - [](std::string va) { return std::stof(va); }); + std::transform(start, end, std::back_inserter(val), [id](std::string va) { + float v = 0.0; + + try { + v = lexical_cast(va); + } catch (boost::bad_lexical_cast& e) { + VLOG(0) << "id: " << id << " get unexpected value: " << va + << " and be reset to: 0.0"; + } + return v; + }); + values->push_back(val); offset += meta.dims[x]; } @@ -126,25 +137,29 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { int64_t save_num = 0; + for (auto& table : block->values_) { for (auto& value : table) { if (mode == SaveMode::delta && !value.second->need_save_) { continue; } - save_num += 1; - auto* vs = value.second->data_.data(); + ++save_num; + std::stringstream ss; + auto* vs = value.second->data_.data(); + auto id = value.first; + ss << id << "\t" << value.second->count_ << "\t" << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; + for (int i = 0; i < 
block->value_length_ - 1; i++) { + ss << std::to_string(vs[i]) << ","; } + ss << std::to_string(vs[block->value_length_ - 1]); ss << "\n"; os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); @@ -170,7 +185,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = std::stoull(values[0]); + auto id = lexical_cast(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -182,15 +197,17 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, auto block = blocks->at(shard_id); std::vector> kvalues; - ProcessALine(values, meta, &kvalues); + ProcessALine(values, meta, id, &kvalues); block->Init(id, false); VALUE* value_instant = block->GetValue(id); + if (values.size() == 5) { - value_instant->count_ = std::stoi(values[1]); - value_instant->unseen_days_ = std::stoi(values[2]); - value_instant->is_entry_ = static_cast(std::stoi(values[3])); + value_instant->count_ = lexical_cast(values[1]); + value_instant->unseen_days_ = lexical_cast(values[2]); + value_instant->is_entry_ = + static_cast(lexical_cast(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); @@ -475,7 +492,7 @@ int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, auto* value = block->InitGet(id); // std::copy_n(value + param_offset_, param_dim_, // pull_values + param_dim_ * offset); - pull_values[offset] = (char*)value; + pull_values[offset] = reinterpret_cast(value); } return 0; diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/distributed/table/graph_edge.cc deleted file mode 100644 index cc90f4c6516c1..0000000000000 --- a/paddle/fluid/distributed/table/graph_edge.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/distributed/table/graph_edge.h" -#include -namespace paddle { -namespace distributed { - -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); -} - -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); - weight_arr.push_back(weight); -} -} -} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h deleted file mode 100644 index 3dfe5a6f357a7..0000000000000 --- a/paddle/fluid/distributed/table/graph_edge.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -namespace paddle { -namespace distributed { - -class GraphEdgeBlob { - public: - GraphEdgeBlob() {} - virtual ~GraphEdgeBlob() {} - size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } - virtual float get_weight(int idx) { return 1; } - - protected: - std::vector id_arr; -}; - -class WeightedGraphEdgeBlob : public GraphEdgeBlob { - public: - WeightedGraphEdgeBlob() {} - virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); - virtual float get_weight(int idx) { return weight_arr[idx]; } - - protected: - std::vector weight_arr; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc deleted file mode 100644 index 27a2cafaf4f0f..0000000000000 --- a/paddle/fluid/distributed/table/graph_node.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/distributed/table/graph_node.h" -#include -namespace paddle { -namespace distributed { - -GraphNode::~GraphNode() { - if (sampler != nullptr) { - delete sampler; - sampler = nullptr; - } - if (edges != nullptr) { - delete edges; - edges = nullptr; - } -} - -int Node::weight_size = sizeof(float); -int Node::id_size = sizeof(uint64_t); -int Node::int_size = sizeof(int); - -int Node::get_size(bool need_feature) { return id_size + int_size; } - -void Node::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - memcpy(buffer, &feat_num, sizeof(int)); -} - -void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } - -int FeatureNode::get_size(bool need_feature) { - int size = id_size + int_size; // id, feat_num - if (need_feature) { - size += feature.size() * int_size; - for (const std::string& fea : feature) { - size += fea.size(); - } - } - return size; -} - -void GraphNode::build_edges(bool is_weighted) { - if (edges == nullptr) { - if (is_weighted == true) { - edges = new WeightedGraphEdgeBlob(); - } else { - edges = new GraphEdgeBlob(); - } - } -} -void GraphNode::build_sampler(std::string sample_type) { - if (sample_type == "random") { - sampler = new RandomSampler(); - } else if (sample_type == "weighted") { - sampler = new WeightedSampler(); - } - sampler->build(edges); -} -void FeatureNode::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - int feat_len; - if (need_feature) { - feat_num += feature.size(); - memcpy(buffer, &feat_num, sizeof(int)); - buffer += sizeof(int); - for (int i = 0; i < feat_num; ++i) { - feat_len = feature[i].size(); - memcpy(buffer, &feat_len, sizeof(int)); - buffer += sizeof(int); - memcpy(buffer, feature[i].c_str(), feature[i].size()); - buffer += feature[i].size(); - } - } else { - memcpy(buffer, &feat_num, sizeof(int)); - } -} -void FeatureNode::recover_from_buffer(char* buffer) { - int feat_num, feat_len; - memcpy(&id, buffer, id_size); - buffer += id_size; - - memcpy(&feat_num, buffer, sizeof(int)); - buffer += sizeof(int); - - feature.clear(); - for (int i = 0; i < feat_num; ++i) { - memcpy(&feat_len, buffer, sizeof(int)); - buffer += sizeof(int); - - char str[feat_len + 1]; - memcpy(str, buffer, feat_len); - buffer += feat_len; - str[feat_len] = '\0'; - feature.push_back(std::string(str)); - } -} -} -} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h deleted file mode 100644 index c3e8e3ce5b50d..0000000000000 --- a/paddle/fluid/distributed/table/graph_node.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -namespace paddle { -namespace distributed { - -class Node { - public: - Node() {} - Node(uint64_t id) : id(id) {} - virtual ~Node() {} - static int id_size, int_size, weight_size; - uint64_t get_id() { return id; } - void set_id(uint64_t id) { this->id = id; } - - virtual void build_edges(bool is_weighted) {} - virtual void build_sampler(std::string sample_type) {} - virtual void add_edge(uint64_t id, float weight) {} - virtual std::vector sample_k(int k) { return std::vector(); } - virtual uint64_t get_neighbor_id(int idx) { return 0; } - virtual float get_neighbor_weight(int idx) { return 1.; } - - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { return std::string(""); } - virtual void set_feature(int idx, std::string str) {} - virtual void set_feature_size(int size) {} - virtual int get_feature_size() { return 0; } - - protected: - uint64_t id; -}; - -class GraphNode : public Node { - public: - GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} - GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} - virtual ~GraphNode(); - virtual void build_edges(bool is_weighted); - virtual void build_sampler(std::string sample_type); - virtual void add_edge(uint64_t id, float weight) { - edges->add_edge(id, weight); - } - virtual std::vector sample_k(int k) { return sampler->sample_k(k); } - virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } - virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } - - protected: - Sampler *sampler; - GraphEdgeBlob *edges; -}; - -class FeatureNode : public Node { - public: - FeatureNode() : Node() {} - FeatureNode(uint64_t id) : Node(id) {} - virtual ~FeatureNode() {} - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { - if (idx < (int)this->feature.size()) { - return this->feature[idx]; - } else { - return std::string(""); - } - } - - virtual void set_feature(int idx, std::string str) { - if (idx >= (int)this->feature.size()) { - this->feature.resize(idx + 1); - } - this->feature[idx] = str; - } - virtual void set_feature_size(int size) { this->feature.resize(size); } - virtual int get_feature_size() { return this->feature.size(); } - - template - static std::string parse_value_to_bytes(std::vector feat_str) { - T v; - size_t Tsize = sizeof(T) * feat_str.size(); - char buffer[Tsize]; - for (size_t i = 0; i < feat_str.size(); i++) { - std::stringstream ss(feat_str[i]); - ss >> v; - std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); - } - return std::string(buffer, Tsize); - } - - template - static std::vector parse_bytes_to_array(std::string feat_str) { - T v; - std::vector out; - size_t start = 0; - const char *buffer = feat_str.data(); - while (start < feat_str.size()) { - std::memcpy((char *)&v, buffer + start, sizeof(T)); - start += sizeof(T); - out.push_back(v); - } - return out; - } - - protected: - std::vector feature; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc deleted file mode 100644 index 059a1d64bc392..0000000000000 --- 
a/paddle/fluid/distributed/table/graph_weighted_sampler.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -#include -#include -namespace paddle { -namespace distributed { - -void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } - -std::vector RandomSampler::sample_k(int k) { - int n = edges->size(); - if (k > n) { - k = n; - } - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - std::vector sample_result; - std::unordered_map replace_map; - while (k--) { - int rand_int = rand() % n; - auto iter = replace_map.find(rand_int); - if (iter == replace_map.end()) { - sample_result.push_back(rand_int); - } else { - sample_result.push_back(iter->second); - } - - iter = replace_map.find(n - 1); - if (iter == replace_map.end()) { - replace_map[rand_int] = n - 1; - } else { - replace_map[rand_int] = iter->second; - } - --n; - } - return sample_result; -} - -WeightedSampler::WeightedSampler() { - left = nullptr; - right = nullptr; - edges = nullptr; -} - -WeightedSampler::~WeightedSampler() { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } -} - -void WeightedSampler::build(GraphEdgeBlob *edges) { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } - return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); -} - -void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, - int end) { - count = 0; - this->edges = edges; - if (start + 1 == end) { - left = right = nullptr; - idx = start; - count = 1; - weight = edges->get_weight(idx); - - } else { - left = new WeightedSampler(); - right = new WeightedSampler(); - left->build_one(edges, start, start + (end - start) / 2); - right->build_one(edges, start + (end - start) / 2, end); - weight = left->weight + right->weight; - count = left->count + right->count; - } -} -std::vector WeightedSampler::sample_k(int k) { - if (k > count) { - k = count; - } - std::vector sample_result; - float subtract; - std::unordered_map subtract_weight_map; - std::unordered_map subtract_count_map; - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - while (k--) { - float query_weight = rand() % 100000 / 100000.0; - query_weight *= weight - subtract_weight_map[this]; - sample_result.push_back(sample(query_weight, subtract_weight_map, - subtract_count_map, subtract)); - } - return sample_result; -} - -int WeightedSampler::sample( - float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract) { - if (left == nullptr) { - subtract_weight_map[this] = weight; - subtract = weight; - subtract_count_map[this] = 1; - return idx; - } - int left_count = left->count - subtract_count_map[left]; - int 
right_count = right->count - subtract_count_map[right]; - float left_subtract = subtract_weight_map[left]; - int return_idx; - if (right_count == 0 || - left_count > 0 && left->weight - left_subtract >= query_weight) { - return_idx = left->sample(query_weight, subtract_weight_map, - subtract_count_map, subtract); - } else { - return_idx = - right->sample(query_weight - (left->weight - left_subtract), - subtract_weight_map, subtract_count_map, subtract); - } - subtract_weight_map[this] += subtract; - subtract_count_map[this]++; - return return_idx; -} -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h deleted file mode 100644 index cfc341d27c6b7..0000000000000 --- a/paddle/fluid/distributed/table/graph_weighted_sampler.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_edge.h" -namespace paddle { -namespace distributed { - -class Sampler { - public: - virtual ~Sampler() {} - virtual void build(GraphEdgeBlob *edges) = 0; - virtual std::vector sample_k(int k) = 0; -}; - -class RandomSampler : public Sampler { - public: - virtual ~RandomSampler() {} - virtual void build(GraphEdgeBlob *edges); - virtual std::vector sample_k(int k); - GraphEdgeBlob *edges; -}; - -class WeightedSampler : public Sampler { - public: - WeightedSampler(); - virtual ~WeightedSampler(); - WeightedSampler *left, *right; - float weight; - int count; - int idx; - GraphEdgeBlob *edges; - virtual void build(GraphEdgeBlob *edges); - virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); - virtual std::vector sample_k(int k); - - private: - int sample(float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract); -}; -} -} diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index b268bb449e146..b8630aed02ffe 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -124,7 +124,6 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } - VLOG(0) << "test single done"; s.clear(); s1.clear(); vs.clear(); @@ -141,6 +140,57 @@ void testSingleSampleNeighboor( } } +void testAddNode( + std::shared_ptr& worker_ptr_) { + worker_ptr_->clear_nodes(0); + int total_num = 270000; + uint64_t id; + std::unordered_set id_set; + for (int i = 0; i < total_num; i++) { + while (id_set.find(id = rand()) != id_set.end()) + ; + id_set.insert(id); + } + std::vector id_list(id_set.begin(), id_set.end()); + std::vector weight_list; + auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); + status.wait(); + std::vector ids[2]; + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, 
total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check.insert(x); + ASSERT_EQ(id_set.size(), id_set_check.size()); + for (auto x : id_set) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } + std::vector remove_ids; + for (auto p : id_set_check) { + if (remove_ids.size() == 0) + remove_ids.push_back(p); + else if (remove_ids.size() < total_num / 2 && rand() % 2 == 1) { + remove_ids.push_back(p); + } + } + for (auto p : remove_ids) id_set_check.erase(p); + status = worker_ptr_->remove_graph_node(0, remove_ids); + status.wait(); + for (int i = 0; i < 2; i++) ids[i].clear(); + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check1.insert(x); + ASSERT_EQ(id_set_check1.size(), id_set_check.size()); + for (auto x : id_set_check1) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } +} void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { std::vector>> vs; @@ -527,6 +577,7 @@ void RunBrpcPushSparse() { std::remove(edge_file_name); std::remove(node_file_name); + testAddNode(worker_ptr_); LOG(INFO) << "Run stop_server"; worker_ptr_->stop_server(); LOG(INFO) << "Run finalize_worker"; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e55fca403af3a..db2f9c9fc5fc5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -100,8 +100,16 @@ if (WITH_GPU) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) +set(BRPC_DEPS "") +if(WITH_PSLIB OR WITH_PSCORE) + set(BRPC_DEPS brpc) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + endif() +endif() + cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) -cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope) +cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker) cc_library(scope_pool SRCS scope_pool.cc DEPS scope) @@ -243,9 +251,16 @@ if(WITH_DISTRIBUTE) fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor - heter_service_proto pslib_brpc) + heter_service_proto ${BRPC_DEP}) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) elseif(WITH_PSCORE) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc @@ -272,6 +287,15 @@ if(WITH_DISTRIBUTE) graph_to_program_pass variable_helper 
timer monitor) endif() elseif(WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc @@ -280,7 +304,7 @@ elseif(WITH_PSLIB) pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor pslib_brpc ) + graph_to_program_pass variable_helper timer monitor ${BRPC_DEP}) else() cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 8ff94b0277c0c..8708d90485af8 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -143,7 +143,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place) { + platform::Place place, bool always_copy) { PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Input tensor format is invalid. Input tensor should " @@ -177,7 +177,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + if ((in_format != out_format) || always_copy) { void* in_data = GetDataFromTensor(in, in_type); std::string key = platform::CreateKey(*dev_ctx, in_tz, in_format, out_format, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 238f2d2e67914..3404ba2db67e5 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -78,7 +78,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place); + platform::Place place, + bool always_copy = false); void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index c8f73a5469ab3..648a32420aa6c 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -18,6 +18,7 @@ limitations under the License. 
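The -faligned-new switch appended to DISTRIBUTE_COMPILE_FLAGS above (only when the compiler is newer than GCC 7.0) enables C++17-style aligned allocation, so `new` on an over-aligned type returns suitably aligned storage even in pre-C++17 modes; presumably the BRPC/PSLIB-dependent sources allocate such types, hence the guard. A minimal, self-contained illustration of the kind of type the flag matters for (the struct below is invented for this sketch, not taken from the patch):

#include <cstdint>

// A hypothetical over-aligned type, invented only for this illustration.
struct alignas(64) CacheLinePadded {
  std::int64_t value;
};

int main() {
  // With -faligned-new (or -std=c++17), this resolves to the aligned form of
  // operator new, so the pointer honours the 64-byte alignment request.
  CacheLinePadded* p = new CacheLinePadded{0};
  bool ok = reinterpret_cast<std::uintptr_t>(p) % alignof(CacheLinePadded) == 0;
  delete p;
  return ok ? 0 : 1;
}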
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/eigen_ext.h" @@ -30,6 +31,8 @@ struct bfloat16; struct complex128; struct complex64; struct float16; +template +struct complex; } // namespace platform } // namespace paddle @@ -61,6 +64,10 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ _ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); @@ -69,6 +76,10 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 0fdb97db20af9..829772448eb91 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -163,6 +163,11 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, omp_in) #pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \ omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + float > : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + double > : omp_out += omp_in) + #endif template @@ -268,12 +273,69 @@ void CheckNanInf( op_type)); } } + +template <> +void CheckNanInf>( + const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { + float real_sum = 0.0f; +#pragma omp parallel for reduction(+ : real_sum) + for (size_t i = 0; i < numel; ++i) { + real_sum += (value[i].real - value[i].real); + } + + float imag_sum = 0.0f; +#pragma omp parallel for reduction(+ : imag_sum) + for (size_t i = 0; i < numel; ++i) { + imag_sum += (value[i].imag - value[i].imag); + } + + if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || + std::isinf(imag_sum)) { + // hot fix for compile failed in gcc4.8 + // here also need print detail info of nan or inf later + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, + op_type)); + } +} + +template <> + void CheckNanInf>> + (const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { + double real_sum = 0.0; +#pragma omp parallel for reduction(+ : real_sum) + for (size_t i = 0; i < numel; ++i) { + real_sum += (value[i].real - value[i].real); + } + + double imag_sum = 0.0; +#pragma omp parallel for reduction(+ : imag_sum) + for (size_t i = 0; i < numel; ++i) { + imag_sum += 
(value[i].imag - value[i].imag); + } + + if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || + std::isinf(imag_sum)) { + // hot fix for compile failed in gcc4.8 + // here also need print detail info of nan or inf later + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, + op_type)); + } +} + #endif template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { // use env strategy control in future, -1=print_all. int print_num = 3; CheckNanInf(tensor_.data(), tensor_.numel(), print_num, op_type_, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 96d1a9fb94927..a9ea336e42545 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -123,7 +123,11 @@ __global__ void CheckNanInfKernel(const T* value, const size_t numel, template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { int print_num = 3; auto* dev_ctx = reinterpret_cast( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index b4459e5a7c1cc..10b7ab0bc9c53 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -46,8 +46,12 @@ struct TensorCheckerVisitor { } template - void apply(typename std::enable_if::value>::type* = - 0) const; + void apply( + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type* = + 0) const; std::string op_type_; std::string var_name_; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index cd5de19bdc088..84369011476c7 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -29,7 +29,7 @@ limitations under the License. 
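The new complex<float> and complex<double> specializations of CheckNanInf above reuse the existing detection trick: accumulate `v - v` over the real and imaginary parts, which stays at zero for finite inputs but turns into NaN as soon as any element is NaN or Inf (Inf - Inf is NaN); the added `#pragma omp declare reduction` lines simply let that accumulation run under `reduction(+ : ...)` for the Paddle complex types. A self-contained sketch of the same idea, written with std::complex instead of the Paddle type and assuming IEEE semantics (no -ffast-math):

#include <cmath>
#include <complex>
#include <vector>

// Returns true if any element has a non-finite real or imaginary part.
// For finite v, v - v == 0, so both sums stay at zero; a single NaN or Inf
// element poisons the corresponding sum, which the final check detects.
bool HasNanOrInf(const std::vector<std::complex<float>>& values) {
  float real_sum = 0.0f;
  float imag_sum = 0.0f;
  for (const auto& v : values) {
    real_sum += v.real() - v.real();
    imag_sum += v.imag() - v.imag();
  }
  return std::isnan(real_sum) || std::isinf(real_sum) ||
         std::isnan(imag_sum) || std::isinf(imag_sum);
}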
*/ #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" @@ -266,6 +266,9 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; +#ifdef PADDLE_WITH_HETERPS + platform::DeviceContext* dev_ctx_ = nullptr; +#endif }; class DownpourWorker : public HogwildWorker { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 654b88920acaf..181e3b6885380 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -141,6 +141,7 @@ message PipelineConfig { message TensorParallelConfig { optional int32 tensor_parallel_degree = 1 [ default = 1 ]; + optional int32 tensor_init_seed = 2 [ default = -1 ]; } message DistributedStrategy { @@ -172,8 +173,9 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; - optional bool find_unused_parameters = 28 [ default = true ]; + optional bool find_unused_parameters = 28 [ default = false ]; optional bool tensor_parallel = 29 [ default = false ]; + optional bool without_graph_optimization = 30 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index b99ab6b5a7ff1..54d8fc92b2945 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -28,9 +28,19 @@ namespace internal { template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; - if (std::is_same::value || - std::is_same::value || - std::is_floating_point::value) { + if (std::is_same>::value || + std::is_same>::value || + std::is_same::value || + std::is_same::value) { + // The current dlpack library version is v0.2, and does not define + // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set + // dtype.code to 5U directly here. After the dlpack library version being + // upgraded to v0.4, it should be written as follow. 
+ // dtype.code = kDLComplex; + dtype.code = 5U; + } else if (std::is_same::value || + std::is_same::value || + std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { dtype.code = kDLUInt; @@ -87,6 +97,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::NPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::NPUPinnedPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index d03437034d62a..1a79ada0be7c6 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -28,6 +28,13 @@ namespace framework { namespace { // NOLINT template constexpr uint8_t GetDLDataTypeCode() { + if (std::is_same>::value || + std::is_same>::value || + std::is_same::value || + std::is_same::value) { + return static_cast(5); + } + return std::is_same::value || std::is_floating_point::value ? static_cast(kDLFloat) diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7593b60abfffc..9c9f29520de43 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -20,14 +20,12 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 782018d1cfe10..3beeacb1010d2 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -22,8 +22,10 @@ #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 03dd2cff655c0..a9e4691dd0a01 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + else() + set(BRPC_DEPS brpc) + endif(WITH_PSLIB_BRPC) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib) else() cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) @@ -7,11 +12,11 @@ endif(WITH_PSLIB) if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS 
ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) endif(WITH_NCCL) else() @@ -39,7 +44,17 @@ else() cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_GLOO) -cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto) +if(WITH_PSLIB) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +endif() +set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif() + +cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto +device_context heter_service_proto ${BRPC_DEPS}) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 613b2803637d2..09f7801b19f98 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -28,7 +28,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 089130f6da8c7..3782e14ad41a5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,16 +17,16 @@ limitations under the License. */ #include #include #include -#ifdef PADDLE_WTIH_PSLIB +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #include "paddle/fluid/platform/type_defs.h" namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 871d2e251b410..4e529de077593 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -25,6 +25,7 @@ limitations under the License. 
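Several include changes in this patch (fleet_wrapper.h here, device_worker.h earlier) switch from the heavy heter_service.h to the new, lighter heter_util.h, and heter_util.h itself only forward-declares `class DataFeed;` because it touches that type solely through pointers. That is the standard dependency-trimming pattern: when a header never needs a type's full definition, a forward declaration replaces the include. A generic sketch of the pattern (the names below are illustrative, not Paddle's):

// widget_registry.h: illustrative header for this sketch only.
#pragma once
#include <map>
#include <string>

class Widget;  // forward declaration: widget.h is only needed by the .cc file

class WidgetRegistry {
 public:
  // Only Widget pointers appear in this interface, so the forward
  // declaration above suffices and widget.h stays out of the header.
  void Register(const std::string& name, Widget* widget);
  Widget* Find(const std::string& name) const;

 private:
  std::map<std::string, Widget*> table_;
};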
*/ #ifdef PADDLE_WITH_PSLIB #include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 3f65eaf3aa121..7e5bf138d9fa9 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -72,299 +72,6 @@ class HeterXpuService : public HeterService { std::unordered_map handler_map_; }; -enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; - -class HeterTask { - public: - void Update() { - if (state_ == PULL_SPARSE) { - state_ = OP_RUN; - } else if (state_ == OP_RUN) { - state_ = XPU; - // state_ = PUSH_GRAD; - // state_ = PUSH_GRAD; - } else if (state_ == XPU) { - state_ = OP_RUN_END; - } else if (state_ == OP_RUN_END) { - state_ = PUSH_GRAD; - } else if (state_ == PUSH_GRAD) { - state_ = DONE; - } - } - void Reset() { - total_time = 0; - read_time = 0; - pack_time = 0; - pull_sparse_local_time = 0; - op_all_time = 0; - xpu_op_time = 0; - xpu_wait_time = 0; - cpu_op_time = 0; - collect_label_time = 0; - fill_sparse_time = 0; - push_sparse_time = 0; - gpu_2_cpu_time = 0; - cpu_2_gpu_time = 0; - timeline.Reset(); - } - void Show() { - std::cout << "features size " << features_.size() << std::endl; - for (size_t i = 0; i < features_.size(); ++i) { - std::cout << "features[" << i << "] size " << features_[i].size() - << std::endl; - } - } - void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, - const ProgramDesc& program); - void PackGpuTask(Scope* thread_scope, DataFeed* reader, - const ProgramDesc& program); - - Scope* scope_{nullptr}; - int taskid_; - int cur_batch_; - HeterTaskState state_; - // cache - std::map> features_; - std::map> feature_labels_; - std::map>> feature_values_; - std::map>> feature_grads_; - std::map> sparse_push_keys_; - double total_time{0}; - double read_time{0}; - double pack_time{0}; - double pull_sparse_local_time{0}; - double op_all_time{0}; - double xpu_op_time{0}; - double xpu_wait_time{0}; - double cpu_op_time{0}; - double collect_label_time{0}; - double fill_sparse_time{0}; - double push_sparse_time{0}; - double gpu_2_cpu_time{0}; - double cpu_2_gpu_time{0}; - platform::Timer timeline; -}; -#endif -template -class HeterObjectPool { - public: - HeterObjectPool() {} - virtual ~HeterObjectPool(){}; - std::shared_ptr Get() { - std::lock_guard lock(mutex_); - if (pool_.empty()) { - num_ += 1; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(3) << "pool construct size: " << num_; -#endif - return std::make_shared(); - } else { - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - } - void Push(std::shared_ptr data) { - std::lock_guard lock(mutex_); - pool_.push_back(std::move(data)); - } - int Size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - std::mutex mutex_; - int num_{0}; -}; - -#ifdef PADDLE_WITH_PSLIB -struct BthreadMutextGuard { - BthreadMutextGuard(bthread_mutex_t* rho) { - mutex_ = rho; - bthread_mutex_lock(mutex_); - } - ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } - bthread_mutex_t* mutex_; -}; - -template -class BtObjectPool { - public: - BtObjectPool() { - bthread_mutex_init(&mutex_, NULL); - bthread_cond_init(&cond_, NULL); - } - - virtual ~BtObjectPool() { 
- bthread_cond_destroy(&cond_); - bthread_mutex_destroy(&mutex_); - }; - - std::shared_ptr Get() { - BthreadMutextGuard guard(&mutex_); - while (pool_.empty()) { - bthread_cond_wait(&cond_, &mutex_); - } - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - - void Push(std::shared_ptr data) { - BthreadMutextGuard guard(&mutex_); - pool_.push_back(std::move(data)); - bthread_cond_signal(&cond_); - } - - int Size() { return pool_.size(); } - - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - bthread_mutex_t mutex_; - bthread_cond_t cond_; - int num_{0}; -}; - -template -struct HeterNode { - K key; - T value; - HeterNode* prev; - HeterNode* next; -}; - -template -class HeterList { - public: - HeterList() : head_(new HeterNode), tail_(new HeterNode) { - head_->prev = NULL; - head_->next = tail_; - tail_->prev = head_; - tail_->next = NULL; - size = 0; - cap_ = 1e9; - } - - ~HeterList() { - delete head_; - delete tail_; - } - - void SetCap(int num) { cap_ = num; } - - bool TryPut(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - if (task_map_.find(key) != task_map_.end()) { - task_map_.erase(key); - return false; - } else { - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - } - - bool Put(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - - T TryGet(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - task_map_.insert(key); - return nullptr; - } - - T Get(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - return nullptr; - } - - T Get() { - std::lock_guard lock(mutex_); - HeterNode* node = head_->next; - if (node == tail_) { - return nullptr; - } else { - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(node->key); - delete node; - return ret; - } - } - - bool Empty() { - std::lock_guard lock(mutex_); - return head_->next == tail_; - } - - int Size() { - std::lock_guard lock(mutex_); - return size; - } - - private: - void detach(HeterNode* node) { - node->prev->next = node->next; - node->next->prev = node->prev; - size--; - } - - void attach(HeterNode* node) { - node->prev = head_; - node->next = head_->next; - head_->next->prev = node; - head_->next = node; - size++; - } - - private: - HeterNode* head_; - HeterNode* tail_; - std::unordered_map*> map_; - std::unordered_set task_map_; - std::mutex mutex_; - std::condition_variable cond_; - int cap_; - int size; -}; #endif } // namespace framework diff --git a/paddle/fluid/framework/heter_util.h b/paddle/fluid/framework/heter_util.h new file mode 100644 index 0000000000000..a08f08428da34 --- /dev/null +++ b/paddle/fluid/framework/heter_util.h @@ -0,0 +1,329 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_PSLIB +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include "bthread/bthread.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { +class DataFeed; +enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; + +class HeterTask { + public: + HeterTask() {} + virtual ~HeterTask(){}; + + void Update() { + if (state_ == PULL_SPARSE) { + state_ = OP_RUN; + } else if (state_ == OP_RUN) { + state_ = XPU; + // state_ = PUSH_GRAD; + // state_ = PUSH_GRAD; + } else if (state_ == XPU) { + state_ = OP_RUN_END; + } else if (state_ == OP_RUN_END) { + state_ = PUSH_GRAD; + } else if (state_ == PUSH_GRAD) { + state_ = DONE; + } + } + void Reset() { + total_time = 0; + read_time = 0; + pack_time = 0; + pull_sparse_local_time = 0; + op_all_time = 0; + xpu_op_time = 0; + xpu_wait_time = 0; + cpu_op_time = 0; + collect_label_time = 0; + fill_sparse_time = 0; + push_sparse_time = 0; + gpu_2_cpu_time = 0; + cpu_2_gpu_time = 0; + timeline.Reset(); + } + void Show() { + std::cout << "features size " << features_.size() << std::endl; + for (size_t i = 0; i < features_.size(); ++i) { + std::cout << "features[" << i << "] size " << features_[i].size() + << std::endl; + } + } + void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, + const ProgramDesc& program); + void PackGpuTask(Scope* thread_scope, DataFeed* reader, + const ProgramDesc& program); + + Scope* scope_{nullptr}; + int taskid_; + int cur_batch_; + HeterTaskState state_; + // cache + std::map> features_; + std::map> feature_labels_; + std::map>> feature_values_; + std::map>> feature_grads_; + std::map> sparse_push_keys_; + double total_time{0}; + double read_time{0}; + double pack_time{0}; + double pull_sparse_local_time{0}; + double op_all_time{0}; + double xpu_op_time{0}; + double xpu_wait_time{0}; + double cpu_op_time{0}; + double collect_label_time{0}; + double fill_sparse_time{0}; + double push_sparse_time{0}; + double gpu_2_cpu_time{0}; + double cpu_2_gpu_time{0}; + platform::Timer timeline; +}; +#endif +template +class HeterObjectPool { + public: + HeterObjectPool() {} + virtual ~HeterObjectPool(){}; + std::shared_ptr Get() { + std::lock_guard lock(mutex_); + if (pool_.empty()) { + num_ += 1; + return std::make_shared(); + } else { + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + } + void Push(std::shared_ptr data) { + std::lock_guard lock(mutex_); + pool_.push_back(std::move(data)); + } + int Size() { + std::lock_guard lock(mutex_); + return pool_.size(); + } + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + std::mutex mutex_; + int num_{0}; +}; + +#ifdef PADDLE_WITH_PSLIB +struct BthreadMutextGuard { + BthreadMutextGuard(bthread_mutex_t* rho) { + mutex_ = rho; + 
bthread_mutex_lock(mutex_); + } + ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } + bthread_mutex_t* mutex_; +}; + +template +class BtObjectPool { + public: + BtObjectPool() { + bthread_mutex_init(&mutex_, NULL); + bthread_cond_init(&cond_, NULL); + } + + virtual ~BtObjectPool() { + bthread_cond_destroy(&cond_); + bthread_mutex_destroy(&mutex_); + }; + + std::shared_ptr Get() { + BthreadMutextGuard guard(&mutex_); + while (pool_.empty()) { + bthread_cond_wait(&cond_, &mutex_); + } + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + + void Push(std::shared_ptr data) { + BthreadMutextGuard guard(&mutex_); + pool_.push_back(std::move(data)); + bthread_cond_signal(&cond_); + } + + int Size() { return pool_.size(); } + + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + bthread_mutex_t mutex_; + bthread_cond_t cond_; + int num_{0}; +}; + +template +struct HeterNode { + K key; + T value; + HeterNode* prev; + HeterNode* next; +}; + +template +class HeterList { + public: + HeterList() : head_(new HeterNode), tail_(new HeterNode) { + head_->prev = NULL; + head_->next = tail_; + tail_->prev = head_; + tail_->next = NULL; + size = 0; + cap_ = 1e9; + } + + ~HeterList() { + delete head_; + delete tail_; + } + + void SetCap(int num) { cap_ = num; } + + bool TryPut(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + if (task_map_.find(key) != task_map_.end()) { + task_map_.erase(key); + return false; + } else { + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + } + + bool Put(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + + T TryGet(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + task_map_.insert(key); + return nullptr; + } + + T Get(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + return nullptr; + } + + T Get() { + std::lock_guard lock(mutex_); + HeterNode* node = head_->next; + if (node == tail_) { + return nullptr; + } else { + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(node->key); + delete node; + return ret; + } + } + + bool Empty() { + std::lock_guard lock(mutex_); + return head_->next == tail_; + } + + int Size() { + std::lock_guard lock(mutex_); + return size; + } + + private: + void detach(HeterNode* node) { + node->prev->next = node->next; + node->next->prev = node->prev; + size--; + } + + void attach(HeterNode* node) { + node->prev = head_; + node->next = head_->next; + head_->next->prev = node; + head_->next = node; + size++; + } + + private: + HeterNode* head_; + HeterNode* tail_; + std::unordered_map*> map_; + std::unordered_set task_map_; + std::mutex mutex_; + std::condition_variable cond_; + int cap_; + int size; +}; +} // namespace framework +} // namespace paddle +#endif diff --git 
a/paddle/fluid/framework/heterbox_worker.cc b/paddle/fluid/framework/heterbox_worker.cc index 726b651fcf4ec..b7df88218cbd4 100644 --- a/paddle/fluid/framework/heterbox_worker.cc +++ b/paddle/fluid/framework/heterbox_worker.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 5e1fabf2038cc..8049a1c9424be 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 89dc5c7d3ea93..b2d170888e28f 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -39,6 +39,9 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } +#ifdef PADDLE_WITH_HETERPS + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); +#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { @@ -150,6 +153,9 @@ void HogwildWorker::TrainFilesWithProfiler() { VLOG(3) << "Going to run op " << op_name[i]; if (!need_skip) { ops_[i]->Run(*thread_scope_, place_); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } VLOG(3) << "Op " << op_name[i] << " Finished"; timeline.Pause(); @@ -167,6 +173,16 @@ void HogwildWorker::TrainFilesWithProfiler() { total_inst += cur_batch; ++batch_cnt; PrintFetchVars(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); + VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst; + for (size_t i = 0; i < op_name.size(); ++i) { + VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] + << ", mean time: " << op_total_time[i] / total_inst + << "s, totol time:" << op_total_time[i] << "sec"; + } +#else if (thread_id_ == 0) { if (batch_cnt > 0 && batch_cnt % 100 == 0) { for (size_t i = 0; i < ops_.size(); ++i) { @@ -178,6 +194,7 @@ void HogwildWorker::TrainFilesWithProfiler() { fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); } } +#endif thread_scope_->DropKids(); timeline.Start(); } @@ -195,7 +212,10 @@ void HogwildWorker::TrainFilesWithProfiler() { void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); + platform::Timer timeline; + timeline.Start(); + int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; @@ -213,9 +233,13 @@ void HogwildWorker::TrainFiles() { } } + total_ins_num += cur_batch; PrintFetchVars(); thread_scope_->DropKids(); } + timeline.Pause(); + VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + << " seconds, ins_num: " << total_ins_num; 
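The profiling changes in hogwild_worker.cc above call `dev_ctx_->Wait()` before reading the timer when PADDLE_WITH_HETERPS is defined, because device work is launched asynchronously and would otherwise be missing from the measured per-op interval; TrainFiles likewise gains a coarse platform::Timer that reports seconds and instance count per worker. A condensed sketch of that measure-after-sync pattern, with std::chrono standing in for platform::Timer and a placeholder Sync() standing in for dev_ctx_->Wait():

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Placeholder for dev_ctx_->Wait(): block until previously launched
// asynchronous device work has completed.
void Sync() {}

void TimeOps(const std::vector<std::function<void()>>& ops, int total_inst) {
  std::vector<double> op_seconds(ops.size(), 0.0);
  for (std::size_t i = 0; i < ops.size(); ++i) {
    auto start = std::chrono::steady_clock::now();
    ops[i]();  // may merely enqueue work on the device
    Sync();    // without this, timing stops at launch, not at completion
    std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - start;
    op_seconds[i] += elapsed.count();
  }
  for (std::size_t i = 0; i < ops.size(); ++i) {
    std::printf("op %zu mean time: %f s\n", i, op_seconds[i] / total_inst);
  }
}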
#if defined PADDLE_WITH_PSCORE if (thread_barrier_) { paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ab69170322ce3..01536fd36ff83 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,6 +50,7 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) @@ -139,6 +140,7 @@ cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_lstm_fuse_pass_cc SRCS fc_lstm_fuse_pass_tester.cc DEPS fc_lstm_fuse_pass framework_proto) cc_test(test_fc_gru_fuse_pass_cc SRCS fc_gru_fuse_pass_tester.cc DEPS fc_gru_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc new file mode 100644 index 0000000000000..f7312ca555531 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +AttrCompat& AttrCompat::IsStringIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + for (auto& str : candidates) { + if (str == value) { + return true; + } + } + return false; + }); + return *this; +} + +AttrCompat& AttrCompat::IsStringMatch( + const std::function& func) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + return func(value); + }); + return *this; +} + +AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + int value = BOOST_GET_CONST(int, attr); + return candidates.find(value) != candidates.end(); + }); + return *this; +} + +//! Todo: append the definition. 
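Each AttrCompat::Is* helper above appends a type-erased predicate to `conditions_` and returns *this, so checks chain fluently and are only evaluated later, when the operator() defined just below runs them against a concrete OpDesc attribute. A stripped-down standalone version of that accumulate-and-chain pattern, simplified to plain int values rather than the Paddle Attribute type:

#include <functional>
#include <set>
#include <vector>

// Minimal stand-in for AttrCompat: every Is* call stores a predicate and
// returns *this for chaining; operator() runs all stored predicates.
class IntChecker {
 public:
  IntChecker& IsIn(const std::set<int>& candidates) {
    conditions_.emplace_back(
        [candidates](int v) { return candidates.count(v) > 0; });
    return *this;
  }
  IntChecker& IsGE(int bound) {
    conditions_.emplace_back([bound](int v) { return v >= bound; });
    return *this;
  }
  bool operator()(int v) const {
    for (const auto& cond : conditions_) {
      if (!cond(v)) return false;
    }
    return true;
  }

 private:
  std::vector<std::function<bool(int)>> conditions_;
};

// IntChecker().IsIn({1, 2, 4}).IsGE(2) accepts 2 and 4, rejects 1 and 8.

The IsLeftDefault() stub that follows is the still-unfinished case flagged by the Todo comment above.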
+AttrCompat& AttrCompat::IsLeftDefault() { return *this; } + +bool AttrCompat::operator()(const OpDesc& op_desc) { + if (!op_desc.HasAttr(attr_name_)) { + return false; + } + const Attribute attr = op_desc.GetAttr(attr_name_); + for (auto& func : conditions_) { + if (!func(attr)) { + return false; + } + } + return true; +} + +AttrCompat& AttrCompat::IsBoolEQ(bool v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + bool value = BOOST_GET_CONST(bool, attr); + return value == v; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsTensor() { + conditions_.emplace_back([](const std::vector& input) -> bool { + return input.size() == 1u; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsOptional() { + optional_ = true; + return *this; +} + +bool InputOrOutputCompat::operator()( + const std::vector& input) const { + if (input.empty()) return false; + for (auto& func : conditions_) { + if (!func(input)) { + return false; + } + } + return true; +} + +AttrCompat& OpCompat::AddAttr(const std::string& attr_name) { + attr_compats_.emplace_back(attr_name, this); + return attr_compats_.back(); +} + +InputOrOutputCompat& OpCompat::AddInput(const std::string& name) { + PADDLE_ENFORCE_EQ(input_compats_.find(name), input_compats_.end(), + platform::errors::InvalidArgument( + "The input with the same name has been added")); + input_compats_.emplace(name, InputOrOutputCompat(name, this)); + return input_compats_.at(name); +} + +InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { + PADDLE_ENFORCE_EQ(output_compats_.find(name), output_compats_.end(), + platform::errors::InvalidArgument( + "The output with the same name has been added")); + output_compats_.emplace(name, InputOrOutputCompat(name, this)); + return output_compats_.at(name); +} + +bool OpCompat::Judge(const OpDesc& op_desc) { + for (auto& attr_compat : attr_compats_) { + if (!attr_compat(op_desc)) { + return false; + } + } + + const VariableNameMap& inputs_map = op_desc.Inputs(); + for (auto& input_desc : inputs_map) { + if (input_compats_.find(input_desc.first) == input_compats_.end()) { + if (!input_desc.second.empty()) { + return false; + } + } + } + for (auto& input_val : input_compats_) { + if (inputs_map.find(input_val.first) == inputs_map.end()) { + if (!input_val.second.Optional()) { + return false; + } + } else { + if (!input_val.second(inputs_map.at(input_val.first))) { + return false; + } + } + } + + const VariableNameMap& outputs_map = op_desc.Outputs(); + for (auto& output_desc : outputs_map) { + if (output_compats_.find(output_desc.first) == output_compats_.end()) { + if (!output_desc.second.empty()) { + return false; + } + } + } + for (auto& output_val : output_compats_) { + if (outputs_map.find(output_val.first) == outputs_map.end()) { + if (!output_val.second.Optional()) { + return false; + } + } else { + if (!output_val.second(outputs_map.at(output_val.first))) { + return false; + } + } + } + return true; +} + +OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) { + std::string name = op_compat.Name(); + op_compat_judgers_[name].reset(new OpCompat(std::move(op_compat))); + return *(op_compat_judgers_[name]); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h new file mode 100644 index 0000000000000..6c0860549fbfe --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -0,0 +1,294 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class OpCompat; + +class AttrCompat { + public: + AttrCompat(const std::string& attr_name, OpCompat* op_compat) + : attr_name_(attr_name), op_compat_(op_compat) {} + + // @{ String-related methods + //! Assert the attribute is an string in the `candidates` domain. + AttrCompat& IsStringIn(const std::set& candidates); + //! Assert the attribute is a string and match a custom judging function. + AttrCompat& IsStringMatch( + const std::function& func); + // @} + + //! Assert the attribute is an integer in the `candidates` domain. + AttrCompat& IsIntIn(const std::set& candidates); + + // @{ Number-releated methods + //! Assert the attribute is a number and > `v`. + template + AttrCompat& IsNumGT(T v); + //! Assert the attribute is a number and >= `v`. + template + AttrCompat& IsNumGE(T v); + //! Assert the attribute is a number and < `v`. + template + AttrCompat& IsNumLT(T v); + //! Assert the attribute is a number and <= `v`. + template + AttrCompat& IsNumLE(T v); + //! Assert the attribute is a number and == `v`. + template + AttrCompat& IsNumEQ(T v); + //! Assert the attribute is a number and matches a customized judging + //! function. + template + AttrCompat& IsNumMatch(bool (*func)(T)); + // @} + + //! Assert the attribute is a boolean value equals `v`. + AttrCompat& IsBoolEQ(bool v); + + //! Tell whether this attribute is left as default value. + AttrCompat& IsLeftDefault(); + + //! Jump back to retrieve OpCompat instance. + OpCompat& End() { return *op_compat_; } + + bool operator()(const OpDesc& op_desc); + + private: + std::string attr_name_; + OpCompat* op_compat_; + std::vector> conditions_; +}; + +class InputOrOutputCompat { + public: + InputOrOutputCompat(const std::string& name, OpCompat* op_compat) + : optional_(false), name_(name), op_compat_(op_compat) {} + + InputOrOutputCompat& IsTensor(); + InputOrOutputCompat& IsOptional(); + bool Optional() const { return optional_; } + bool operator()(const std::vector& input) const; + + //! Jump back to retrieve OpCompat instance. + OpCompat& End() { return *op_compat_; } + + private: + bool optional_; + std::string name_; + OpCompat* op_compat_; + std::vector&)>> conditions_; +}; + +/** + * OpCompat is a helper class to help define the compatible Op definition. 
+ * + * Usage: + * OpCompat compat("FC"); + * compat.AddAttr("in_num_col_dims").IsNumLE(1).End() + * .AddAttr("activation_type").IsStringIn({"tanh", "sigmoid"}).End() + * .AddInput("Input").IsTensor().End() + * .AddInput("W").IsTensor().End() + * .AddInput("Bias").IsTensor().IsOptional().End() + * .AddOutput("Out").IsTensor().End() + * + * All the inference-aware Op defition is as above, all the other attributes not + * contained in the definition should be set default value or it would be judged + * incompatible. + */ +class OpCompat { + public: + explicit OpCompat(const std::string& op_name) : op_name_(op_name) {} + explicit OpCompat(std::string&& op_name) : op_name_(std::move(op_name)) {} + explicit OpCompat(const OpCompat&) = default; + explicit OpCompat(OpCompat&&) = default; + + AttrCompat& AddAttr(const std::string& attr_name); + InputOrOutputCompat& AddInput(const std::string& name); + InputOrOutputCompat& AddOutput(const std::string& name); + + //! Judge whether an OpDesc match the defined Op compatibility. + bool Judge(const OpDesc& op_desc); + const std::string& Name() const { return op_name_; } + + private: + std::string op_name_; + std::vector attr_compats_; + std::unordered_map input_compats_; + std::unordered_map output_compats_; +}; + +/** + * OpCompatSensiblePass is a base class for all the passes thouse is sensitive + * to Op update. + * There are two methods to help tell the compability of an Op + * bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, Graph* g); + * bool IsCompat(const OpDesc& op_desc); + * + * One can register the related Op compabilities using + * void AddOpCompat(OpCompat&& judger); + * + * Most of the Passes are used for fusing ops, so we define a method for such + * scenerios. + * void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g); + * It will check the Op compatibility automatically. + * For other scenirios, one should call `IsCompat` by himself. + * + * A FC fuse pass example: + * class FcFusePass : public OpCompatSensiblePass { + * public: + * FcFusePass() { + * // define Mul op compatiblity. + * AddOpCompat(OpCompat("Mul")) + * .AddInput("Input").IsTensor().End() + * .AddAttr("in_num_col_dims").IsNumGE(1); + * AddOpCompat(OpCompat("Add")). ...; + * // There are multiple activation implemention. + * AddOpCompat(OpCompat("Tanh")). ...; + * AddOpCompat(OpCompat("Sigmoid")). ...; + * } + * + * // override the subgraph access method + * virtual bool AccessSubgraphImpl( + * const GraphPatternDetector::subgraph_t& subgraph, + * Graph* g) override { ... } + * + * // Call the AccessSubgraph method in main procedure of this Pass. + * }; + */ +class OpCompatSensiblePass : public Pass { + public: + //! Access the subgraph and pattern. + void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (IsCompat(subgraph, g)) { + AccessSubgraphImpl(subgraph, g); + } + } + + protected: + /** + * Developer should push the compatibility `teller` for each kind of Op in the + * subgraph. + * NOTE One should add all the related op compatiblity in the construct so + * that all the following methods are valid. + */ + OpCompat& AddOpCompat(OpCompat&& op_compat); + + //! Modify the subgraph. + virtual bool AccessSubgraphImpl( + const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { + return true; + } + + //! Tell the Op compability of a subgraph. 
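The predicate implementations in the .cc file above pull typed values out of Paddle's Attribute variant with BOOST_GET_CONST before testing them. A rough std::variant analogue of that typed extraction, for illustration only (the real Attribute variant has many more alternatives):

#include <string>
#include <variant>

// Illustrative stand-in for the framework Attribute variant.
using Attr = std::variant<int, float, bool, std::string>;

// Analogue of BOOST_GET_CONST(std::string, attr): extracts the typed value,
// throwing std::bad_variant_access if the attribute holds a different type.
bool IsTanh(const Attr& attr) {
  return std::get<std::string>(attr) == "tanh";
}

The IsCompat overload that follows walks every op node of a matched subgraph and applies the judger registered for its type, rejecting the subgraph if any op type is unregistered or fails its check.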
+ bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) const { + CHECK(!op_compat_judgers_.empty()) + << "At least one OpCompat instance should be added in the " + "OpCompatSensiblePass."; + // Check the all the ops in the subgraph are contained in the + // op_compat. + for (auto& node_pair : subgraph) { + if (!node_pair.first->IsOp()) continue; + auto op_type = node_pair.second->Op()->Type(); + if (!op_compat_judgers_.count(op_type)) { + return false; + } + auto& judger = *op_compat_judgers_.at(op_type); + if (!judger.Judge(*(node_pair.second->Op()))) { + return false; + } + } + return true; + } + + //! Tell the op compatibility of a single Op. + bool IsCompat(const OpDesc& op_desc) const { + if (!op_compat_judgers_.count(op_desc.Type())) return false; + return op_compat_judgers_.at(op_desc.Type())->Judge(op_desc); + } + + private: + std::map> op_compat_judgers_; +}; + +template +AttrCompat& AttrCompat::IsNumGT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value > v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumGE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value >= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value < v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value <= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumEQ(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value == v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumMatch(bool (*func)(T)) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return func(value); + }); + return *this; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc new file mode 100644 index 0000000000000..3d0863a6d12d9 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(OpCompatSensiblePass, compatOp) { + auto lambda = [](const std::string& str) { return str == "tanh"; }; + OpCompat compat("FC"); + compat.AddAttr("in_num_col_dims") + .IsIntIn({1, 2}) + .IsNumLE(1) + .IsLeftDefault() + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", "sigmoid"}) + .IsStringMatch(lambda) + .End() + .AddAttr("test_attr") + .IsBoolEQ(true) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("Test") + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + attr_map["test_attr"] = true; + + fc_op.SetAttrMap(attr_map); + + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_STREQ(compat.Name().c_str(), "FC"); + EXPECT_TRUE(compat.Judge(fc_op)); +} + +class OpCompatSensiblePassTest : public OpCompatSensiblePass { + public: + OpCompatSensiblePassTest(); + bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); } +}; + +OpCompatSensiblePassTest::OpCompatSensiblePassTest() { + AddOpCompat(OpCompat("FC")) + .AddAttr("in_num_col_dims") + .IsNumLE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", "sigmoid"}) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor(); +} + +TEST(OpCompatSensiblePass, IsCompat) { + OpCompatSensiblePassTest test; + OpDesc fc_op; + fc_op.SetType("FC"); + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + + fc_op.SetAttrMap(attr_map); + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_TRUE(test.TestIsCompat(fc_op)); + + ProgramDesc prog; + std::unique_ptr g(new Graph(prog)); + Node* o1 = g->CreateOpNode(&fc_op); + + GraphPatternDetector detector; + PDNode* op2 = + detector.mutable_pattern()->NewNode([](Node* x) { return true; }); + GraphPatternDetector::subgraph_t subgraph; + subgraph[op2] = o1; + + test.AccessSubgraph(subgraph, g.get()); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 198bb65863bb6..7afa76c3fbd23 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -176,6 +176,7 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { +#ifdef PADDLE_WTIH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); auto& recv_ctx = communicator->GetRecvCtxMap(); Scope* thread_scope = workers_[0]->GetThreadScope(); @@ -189,6 +190,7 @@ void MultiTrainer::MergeDenseParam() { TensorCopy((*tensor), root_tensor->place(), root_tensor); } } +#endif } #endif diff --git 
a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 955c917b2c1bf..25d430df45825 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1228,6 +1228,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, // will be executed and a warning will be given at the same time. if (SupportGPU()) { expected_kernel_key.place_ = dev_ctx->GetPlace(); + } else if (SupportNPU()) { + expected_kernel_key.place_ = dev_ctx->GetPlace(); } else { expected_kernel_key.place_ = platform::CPUPlace(); LOG_FIRST_N(WARNING, 1) @@ -1299,7 +1301,11 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); - original_tensor->Resize(original_dims); + // In order to solve the problem that the output latitude of NPU reshape + // operator is not changed when inplace. + if (type_ != "reshape2" && type_ != "reshape2_grad") { + original_tensor->Resize(original_dims); + } } } @@ -1549,10 +1555,10 @@ void OperatorWithKernel::ParseInputDataType( } else if (var->IsType()) { t = &(var->Get().value()); } else if (var->IsType()) { - auto t_arr = var->Get(); - for (size_t j = 0; j < t_arr.size(); j++) { - if (t_arr[j].IsInitialized()) { - t = &(t_arr[j]); + auto t_arr = &var->Get(); + for (size_t j = 0; j < t_arr->size(); j++) { + if (t_arr->at(j).IsInitialized()) { + t = &(t_arr->at(j)); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3fc61581eca72..fc01513a866e4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -154,6 +154,7 @@ class OperatorBase { std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } + virtual bool SupportNPU() const { return false; } const std::string& Type() const { return type_; } @@ -490,6 +491,13 @@ class OperatorWithKernel : public OperatorBase { return platform::is_gpu_place(kern_pair.first.place_); }); } + bool SupportNPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_npu_place(kern_pair.first.place_); + }); + } bool SupportsMKLDNN(proto::VarType::Type data_type) const; bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 73a699b41c8e0..eb021609e8258 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1407,10 +1407,23 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, member_->places_, graph)); } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); + if (member_->use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU) + VLOG(3) << "use BindThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not 
compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); + } } final_graphs.emplace_back(graph); } diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 75c42fa3e5273..3bd50229b94de 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -35,7 +35,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); -#if (defined PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); #elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT place_ = platform::NPUPlace(place_id); diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index d178c4e556ca5..66d8a40dda160 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" @@ -129,8 +128,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { } } } - // pull_queue_ = paddle::framework::MakeChannel>(); - // push_queue_ = paddle::framework::MakeChannel>(); } void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 00ff50abadd18..993b9ac52c5b5 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -110,8 +110,22 @@ void SectionWorker::TrainFiles() { BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); } } +#elif defined(PADDLE_WITH_ASCEND_CL) + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. 
+ VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #endif - } + } // max_memory_size >= 0 if (schedule_mode_ == 0) { // F-then-B scheduler which runs Forward phase for all microbatches, diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 78fd1af09e294..105751645bbc5 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor { // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPinnedPlace& cpu) const { + return *out.data(); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl( + const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const { + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 22c8e1c1665f1..15c478e531e9c 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -166,8 +170,30 @@ void TensorFromVector(const std::vector& src, // Since vector is on cpu, I think this function should be a "sync" operation, // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. vector -> npu pinned tensor + Tensor npu_pinned_tensor(dst->type()); + platform::NPUPinnedPlace npu_pinned_place; + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif } @@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. 
vector -> npu pinned tensor + platform::NPUPinnedPlace npu_pinned_place; + Tensor npu_pinned_tensor; + npu_pinned_tensor.Resize(dst->dims()); + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type()); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif delete[] array; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 3ac36bd2e4a24..636760029fedc 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -26,8 +26,9 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/fleet/heter_context.h" +//#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" @@ -46,6 +47,10 @@ class PullDenseWorker; class Scope; class VarDesc; class DeviceWorker; +class HeterWrapper; +class HeterRequest; +class HeterResponse; + template class ChannelObject; diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index fd2bb6e5c9952..b4154737e0fbc 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -160,7 +160,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first != "X") { continue; } @@ -191,7 +192,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. 
- if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first == "X" && dst_type == framework::proto::VarType::FP32) { continue; } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 43546cf99c69f..6b9b411713329 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif + void operator()(const platform::NPUPinnedPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e3dd0a2aa75b4..0f6676ed48f34 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector> &vars, is_sparse_gradient_(is_sparse_gradient), parallel_ctx_(parallel_ctx), group_size_limits_(group_size_limits), - find_unused_vars_(find_unused_vars) { + find_unused_vars_each_step_(find_unused_vars) { VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); @@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { } } -// After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. -void Reducer::PrepareForBackward( +void Reducer::TraverseBackwardGraph( const std::vector> &outputs) { - VLOG(3) << "after forward, then reset count for backward."; - next_group_ = 0; - std::for_each(groups_.begin(), groups_.end(), [](Group &group) { - group.pending_ = group.variable_indices_.size(); - group.sparse_contents_ = nullptr; - }); - - // reinitialize vars_marked_ready_ for next iteration - vars_marked_ready_.clear(); - vars_marked_ready_.resize(vars_.size(), false); - - PADDLE_ENFORCE_EQ( - groups_need_finalize_, false, - platform::errors::PreconditionNotMet( - "A serious error has occurred here. There may be several reasons: " - "1) Please note that all forward outputs derived from the module " - "parameters must participate in the calculation of losses and " - "subsequent gradient calculations. If not, the wrapper will hang, " - "waiting for autograd to generate gradients for these parameters. " - "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph. " - "2) Used multiple forwards and one backward. You may be able to wrap " - "multiple forwards in a model.")); - - // The first var to trigger the unused parameter - has_marked_unused_vars_ = false; - unused_vars_.clear(); - - if (!find_unused_vars_) { - return; - } - node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -554,8 +520,50 @@ void Reducer::PrepareForBackward( << "] is not used"; } } +} - if (unused_vars_.empty()) { +// After each batch is calculated, the counter of each group(group.pending_) +// and allreudce sequence counter(next_group_) will be cleaned up again. 
+void Reducer::PrepareForBackward( + const std::vector> &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](Group &group) { + group.pending_ = group.variable_indices_.size(); + group.sparse_contents_ = nullptr; + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { LOG_FIRST_N(WARNING, 1) << "All parameters are involved in the backward pass. " "It is recommended to set find_unused_parameters to False " @@ -564,7 +572,9 @@ void Reducer::PrepareForBackward( "will occur. Please make it clear that in the subsequent " "training, there will be no parameters that are not used " "in the backward pass, and then set find_unused_parameters"; - } else if (unused_vars_.size() == vars_.size()) { + } + + if (unused_vars_.size() == vars_.size()) { LOG_FIRST_N(WARNING, 1) << "There is no parameter in the device involved " "in the backward calculation. If there are " @@ -595,13 +605,13 @@ void Reducer::AddDistHook(size_t var_index) { local_used_vars_[var_index] = 1; - // rebuild group when find_unused_vars_ is false + // rebuild group when find_unused_vars_each_step_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } - if (!has_marked_unused_vars_ && find_unused_vars_) { + if (!has_marked_unused_vars_) { has_marked_unused_vars_ = true; for (const auto &unused_index : unused_vars_) { MarkVarReady(unused_index, false); @@ -622,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (vars_marked_ready_[var_index]) { auto error_info = string::Sprintf( "Error happened, when parameter[%d][%s] has been ready before. " - "There may be several reasons for this error: " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " "1) In multiple reentrant backward phase, some parameters are reused." "2) Using model parameters outside of forward function. 
Please " "make sure that model parameters are not shared in concurrent " "forward-backward passes.", @@ -690,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } } else { // process sparse group - PADDLE_ENFORCE_EQ(HasGrad(var_index), true, - platform::errors::PreconditionNotMet( - "The sparse parameter[%d][%s] must have a gradient", - var_index, vars_[var_index]->Name())); + PADDLE_ENFORCE_EQ( + HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] should have gradient. " + "Currently, DataParallel does not support sparse " + "parameters without generating gradients during training. " + "For example, if is_sparse=True is used in Embedding, " + "the current step of this parameter cannot generate gradient " + "because of stop_gradient/detach, where an error will occur.", + var_index, vars_[var_index]->Name())); auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( @@ -943,7 +961,7 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - if (find_unused_vars_) { + if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ProcessUnusedDenseVars(); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 0d613dbea8963..8392ab2c704d5 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -162,13 +162,16 @@ class Reducer { std::vector> RebuildGruops(); inline bool NeedRebuildGroup() { - return !has_rebuilt_group_ && !find_unused_vars_; + return !has_rebuilt_group_ && !find_unused_vars_each_step_; } void ProcessUnusedDenseVars(); bool HasGrad(size_t var_index); + void TraverseBackwardGraph( + const std::vector>& outputs); + private: std::vector> vars_; std::vector> group_indices_; @@ -195,7 +198,8 @@ class Reducer { std::unordered_map var_index_map_; std::vector unused_vars_; bool has_marked_unused_vars_{false}; - bool find_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training.
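The reducer change above replaces the single find_unused_vars_ flag with find_unused_vars_once_ (the backward graph is traversed once, on the first step) and find_unused_vars_each_step_ (the traversal repeats every step). At the Python layer this behaviour is driven by the find_unused_parameters argument that the new error messages refer to. A minimal dygraph sketch, assuming the Paddle 2.1 paddle.DataParallel API; the Net layer, tensor shapes and optimizer settings below are illustrative only, and the script would normally be started with python -m paddle.distributed.launch:

import paddle
import paddle.distributed as dist

dist.init_parallel_env()

class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc1 = paddle.nn.Linear(8, 8)
        self.fc2 = paddle.nn.Linear(8, 8)  # never used in forward()

    def forward(self, x):
        return self.fc1(x)

# find_unused_parameters=True asks the Reducer to traverse the backward graph
# every step, so the parameters of the unused fc2 branch are still marked
# ready and the allreduce does not hang waiting for their gradients.
model = paddle.DataParallel(Net(), find_unused_parameters=True)
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

x = paddle.randn([4, 8])
loss = model(x).mean()
loss.backward()
opt.step()
opt.clear_grad()

When every parameter really does receive a gradient, leaving find_unused_parameters at its default False avoids the per-step traversal, which is exactly what the LOG_FIRST_N warning above recommends.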
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 82c95ba2c9571..c7d947c58039e 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -71,7 +71,7 @@ elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() -if(WITH_TESTING) +if(WITH_TESTING AND TEST test_api_impl) if(NOT APPLE) set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 89c8c7902bac9..1ec692d3d1df6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -650,13 +650,6 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--cudnn_deterministic=True"); } - if (config.thread_local_stream_enabled()) { - gflags.push_back("--allocator_strategy=thread_local"); - process_level_allocator_enabled = false; - } else { - process_level_allocator_enabled = true; - } - // TODO(wilber): jetson tx2 may fail to run the model due to insufficient memory // under the native_best_fit strategy. Modify the default allocation strategy to // auto_growth. todo, find a more appropriate way to solve the problem. @@ -664,6 +657,15 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--allocator_strategy=auto_growth"); #endif + // TODO(Shixiaowei02): Add a mandatory scheme to use the thread local + // allocator when multi-stream is enabled. + if (config.thread_local_stream_enabled()) { + gflags.push_back("--allocator_strategy=thread_local"); + process_level_allocator_enabled = false; + } else { + process_level_allocator_enabled = true; + } + if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f7dbfd39cd26e..43306b79fabf6 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -161,8 +162,24 @@ void Tensor::CopyToCpu(T *data) { auto *t_data = tensor->data(); auto t_place = tensor->place(); + paddle::framework::Tensor out; + auto mem_allocation = std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); + out.ResetHolder(mem_allocation); + if (paddle::platform::is_cpu_place(t_place)) { +#ifdef PADDLE_WITH_MKLDNN + if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) + paddle::framework::innerTransDataLayoutFromMKLDNN( + tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), + *tensor, &out, paddle::platform::CPUPlace(), true); + else + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 19d79510547ec..47f5cc97d39cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -66,6 +66,25 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + auto input_dim = X->getDimensions(); + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < expand_shape.nbDims; i++) { + if (i < input_dim.nbDims) { + expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *X, scale_mode, shift_weights.get(), @@ -77,7 +96,17 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), power_weights.get()); layer = scale_layer; } - + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = input_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = input_dim.d[i] < 0 ? 
0 : input_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 6167e68df2b67..d2dcd4d11bfc8 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -37,7 +37,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); - + auto output_name = op_desc.Output("Out").front(); auto input_names = op_desc.InputNames(); bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; @@ -54,7 +54,7 @@ class FcOpConverter : public OpConverter { Y_v, platform::errors::NotFound( "Can not find %s presistale var of fc in scope.", w_name)); auto* Y_t = Y_v->GetMutable(); - const int x_num_col_dims = + int x_num_col_dims = op_desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) : (op_desc.HasAttr("in_num_col_dims") @@ -106,8 +106,8 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - nvinfer1::ILayer* fc_layer = nullptr; if (enable_int8) { + // add conv layer PADDLE_ENFORCE_EQ( op_desc.HasAttr("out_threshold"), true, platform::errors::InvalidArgument( @@ -115,22 +115,46 @@ class FcOpConverter : public OpConverter { float out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); nvinfer1::DimsHW nv_ksize(1, 1); - fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, - nv_ksize, weight.get(), bias.get()); - engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); - } else { - fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); - } - - auto output_name = op_desc.Output("Out").front(); - if (activation_type == "relu") { - nvinfer1::IActivationLayer* relu_layer = - TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode); + auto* fc_layer_int8 = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_int8->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_int8, "shuffle_after_fc", + {output_name}, test_mode); + } } else { - RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode); + // add fc layer + auto* fc_layer_before = + TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output, + weight.get(), bias.get()); + fc_layer_before->setName( + ("fc_layer_before(Output: " + output_name + ")").c_str()); + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; 
i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *fc_layer_before->getOutput(0)); + fc_layer_float->setReshapeDimensions(reshape_after_fc_dim); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_float->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_float, "shuffle_after_fc", + {output_name}, test_mode); + } } }; @@ -157,153 +181,43 @@ class FcOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_num)}; - if (engine_->with_dynamic_shape()) { - // not NCHW layout, but NLP layout with added 'x 1 x 1' - auto x_dim = X->getDimensions(); - if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && - x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) { - // fc which is just after self attention - regist_fc(X, n_output, weight, bias); - return; - } - PADDLE_ENFORCE_LE( - x_dim.nbDims - x_num_col_dims, 3, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " - "x_dim.nbDims = %d, x_num_col_dims = %d.", - x_dim.nbDims, x_num_col_dims)); - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - // padding shape "x 1 x 1" - int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); - reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; - int cur_dim_index = reshape_before_fc_dim.nbDims - 1; - while (padding_length-- > 0) { - reshape_before_fc_dim.d[cur_dim_index--] = 1; - } - while (cur_dim_index >= 0) { - reshape_before_fc_dim.d[cur_dim_index--] = 0; - } - - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - reshape_after_fc_dim.nbDims = x_num_col_dims + 1; - for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { - reshape_after_fc_dim.d[i] = 0; - } - - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); - - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } - return; + auto x_dim = X->getDimensions(); + // Running the TRT Static Shape mode: x_num_col_dims-1 + if (!engine_->with_dynamic_shape()) { + x_num_col_dims--; } - // in order to handle situations in NLP models(input dims < 3, - // x_num_col_dims != 1, 
etc.), reshape input to perform FC correctly. - auto* reshape_itensor = X; - int input_dims = X->getDimensions().nbDims; - auto input_d = X->getDimensions().d; - int reshape_dim3[3] = {0}; - int reshape_dim4[4] = {0}; - PADDLE_ENFORCE_LE(x_num_col_dims, input_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_num_col_dims <= input dims")); - if (x_num_col_dims == 1) { - if (input_dims == 4) { - PADDLE_ENFORCE_EQ( - input_d[3], 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to 1 and input " - "dims equals to 4, the last dim of input must be 1, but got %d", - input_d[3])); - } - if (enable_int8) { - reshape_dim3[0] = 1; - for (int i = 0; i < 3; i++) { - reshape_dim3[0] *= input_d[i]; - if (i > 0) { - reshape_dim3[i] = 1; - } - } - } else { - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; - } - } - } - - nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], - reshape_dim3[2]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - } else { - PADDLE_ENFORCE_NE(input_dims, 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to " - "2, input_dims should not be 1")); - - if (enable_int8) { - for (int i = 0; i < 4; i++) { - if (i == 0) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; - if (i < input_dims) { - reshape_dim4[1] *= input_d[i]; - } - } - } + PADDLE_ENFORCE_GT( + x_dim.nbDims, x_num_col_dims, + platform::errors::InvalidArgument( + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, x_num_col_dims)); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; } else { - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; - } + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; } - nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], - reshape_dim4[2], reshape_dim4[3]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } + } + auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } regist_fc(reshape_itensor, n_output, weight, bias); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 48c7b7fdd0d79..54fc9492b7193 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -633,6 +633,20 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "fc") { + int x_num_col_dims = + desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) + : (desc.HasAttr("in_num_col_dims") + ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) + : 1); + if (x_num_col_dims < 1) { + VLOG(3) << "converter expects x_num_col_dims >= 1, " + "but x_num_col_dims = %d."; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f74cd671d6dca..60479f806f366 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -242,10 +242,10 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) -if(NOT WIN32 AND NOT APPLE) +if(NOT WIN32 AND NOT APPLE AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") endif() -if (WIN32) +if (WIN32 AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) endif() @@ -645,6 +645,10 @@ if(WITH_GPU) ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() +if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + return() +endif() + if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 2ea047fa13c10..9a0637453f03f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -29,6 +29,7 @@ endif() if (WITH_ASCEND_CL) cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) + cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) @@ -73,10 +74,15 @@ endif() list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) +if (WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +endif() + + cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 730efa5c64688..3a156f1fa3c4c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -20,6 +20,9 @@ #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include 
"paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -72,6 +75,7 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } + InitNaiveBestFitNPUPinnedAllocator(); #endif break; } @@ -195,6 +199,12 @@ class AllocatorFacadePrivate { void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); } + + void InitNaiveBestFitNPUPinnedAllocator() { + allocators_[platform::NPUPinnedPlace()] = + std::make_shared(); + } + #endif class ZeroSizeAllocator : public Allocator { @@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index fa906fbf5ce8f..7f6ad561aa931 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -15,11 +15,17 @@ #pragma once #include #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_ASCEND_CL +using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; +#endif // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should @@ -46,6 +52,7 @@ class AllocatorFacade { // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); // TODO(yy): Allocate a Copy-On-Write allocation? 
private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 3e88d61783c9e..bc72b4b20d061 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -287,6 +287,21 @@ class NPUBuddyAllocatorList { BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { return NPUBuddyAllocatorList::Instance()->Get(npu_id); } + +BuddyAllocator *GetNPUPinnedBuddyAllocator() { + static std::once_flag init_flag; + static BuddyAllocator *ba = nullptr; + + std::call_once(init_flag, []() { + ba = new BuddyAllocator(std::unique_ptr( + new detail::NPUPinnedAllocator), + platform::NPUPinnedMinChunkSize(), + platform::NPUPinnedMaxChunkSize()); + }); + + return ba; +} + #endif template <> @@ -351,6 +366,59 @@ uint64_t Release(const platform::NPUPlace &place) { #endif } +template <> +size_t Used(const platform::NPUPinnedPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUPinnedBuddyAllocator()->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPinnedPlace &place, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUPinnedBuddyAllocator(); + void *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size + << " bytes in NPUPinnedPlace"; + } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPinnedPlace &place, + void *p, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + GetNPUPinnedBuddyAllocator()->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release( + const platform::NPUPinnedPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUPinnedBuddyAllocator()->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + // For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc new file mode 100644 index 0000000000000..507a8589d94dd --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +void NPUPinnedAllocator::ProcessEventsAndFree() { + for (auto it = npu_events_.begin(); it != npu_events_.end();) { + aclrtEvent event = it->second; + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + + if (status == ACL_EVENT_STATUS_COMPLETE) { + Allocation *allocation = it->first; + void *ptr = allocation->ptr(); + free(ptr); + npu_events_.erase(it++); + delete allocation; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + } else { + ++it; + } + } +} + +Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { + ProcessEventsAndFree(); + void *ptr; + int error = posix_memalign(&ptr, kAlignment, size); + PADDLE_ENFORCE_EQ( + error, 0, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", size, error)); + return new Allocation(ptr, size, platform::NPUPinnedPlace()); +} + +void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { + void *ptr = allocation->ptr(); + auto iter = npu_events_.find(allocation); + aclrtEvent event = iter->second; + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + if (status == ACL_EVENT_STATUS_COMPLETE) { + free(ptr); + npu_events_.erase(allocation); + delete allocation; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + } + return; +} + +uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { + return static_cast(0); +} + +void NPUPinnedAllocator::RecordEvent(Allocation *allocation, + aclrtStream stream) { + aclrtEvent event = nullptr; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); + npu_events_.insert({allocation, event}); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle +#endif diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h new file mode 100644 index 0000000000000..4c856b931ee2c --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -0,0 +1,51 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include // NOLINT +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUPinnedAllocator : public Allocator { + public: + bool IsAllocThreadSafe() const override { return true; } + void ProcessEventsAndFree(); + void RecordEvent(Allocation *allocation, aclrtStream stream); + constexpr static size_t kAlignment = 4096UL; + + protected: + Allocation *AllocateImpl(size_t size) override; + void FreeImpl(Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + std::unordered_map npu_events_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 0d7065d8bfba0..d6dc303ebc789 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) { } bool NPUAllocator::UseGpu() const { return true; } + +void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + size_t usable = + paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_; + + if (size > usable) { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB pinned memory." + << ", available " << usable / 1024.0 / 1024.0 << " MB"; + return nullptr; + } + + void* p; + // PINNED memory is visible to all NPU contexts. + auto result = aclrtMallocHost(&p, size); + + if (result == ACL_ERROR_NONE) { + *index = 1; // PINNED memory + npu_pinnd_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "aclrtMallocHost failed."; + return nullptr; + } + + return nullptr; +} + +void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { + aclError err; + PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); + + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated npu pinned memory (%d)", + size, npu_pinnd_alloc_size_)); + npu_pinnd_alloc_size_ -= size; + err = aclrtFreeHost(p); + + if (err != ACL_ERROR_NONE) { + PADDLE_ENFORCE_EQ( + err, 0, + platform::errors::Fatal( + "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d", + err)); + } +} + +bool NPUPinnedAllocator::UseGpu() const { return false; } + #endif } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 26711ae4070f5..92042f0bbae9f 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator { size_t npu_alloc_size_ = 0; int npu_id_; }; + +class NPUPinnedAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t npu_pinnd_alloc_size_ = 0; +}; #endif } // namespace detail diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 730d49e8acd93..a925957e1af10 100644 --- a/paddle/fluid/memory/memcpy.cc +++ 
b/paddle/fluid/memory/memcpy.cc @@ -245,7 +245,7 @@ void Copy(platform::CPUPlace dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); + platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -294,6 +294,86 @@ void Copy(platform::NPUPlace dst_place, } } } + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::NPUPinnedPlace dst_place, void* dst, + platform::NPUPinnedPlace src_place, const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(src_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy( + platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not be done. + // So, we need to wait before the sync operation.
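+ // Calling Wait() on the destination NPU device context below drains the copies + // already queued asynchronously on its stream, so the synchronous copy that + // follows cannot start before those earlier operations have finished.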
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 5c431ce77dc76..796425a132b00 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -164,9 +164,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsKernel, ops::AbsKernel, ops::AbsKernel, + paddle::platform::complex>, ops::AbsKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad, ops::AbsGradKernel, @@ -174,9 +174,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsGradKernel, ops::AbsGradKernel, ops::AbsGradKernel, + paddle::platform::complex>, ops::AbsGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad_grad, @@ -187,6 +187,6 @@ REGISTER_OP_CPU_KERNEL( ops::AbsDoubleGradKernel, ops::AbsDoubleGradKernel, + paddle::platform::complex>, ops::AbsDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index e373d628f6cbd..d03de7a45628a 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -13,44 +13,80 @@ // limitations under the License. #include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ math::Real operator()(const T* args) const { + return abs(args[0]); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return std::abs(args[0]); + } +}; + +template +class AbsKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, + functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel); + abs, ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel>, + ops::AbsKernel>); REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel); + abs_grad, ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel>, + ops::AbsGradKernel>); REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel); + 
abs_grad_grad, ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel>, + ops::AbsDoubleGradKernel>); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 836c5fa06f6df..87e65e8817798 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { @@ -663,6 +664,640 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct CudaLog1pFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // log1p(x) = log(1 + x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(one + x)); + } +}; + +template +struct CudaLog1pGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog2Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log2(x) = log2(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log2(x)); + } +}; + +template +struct CudaLog2GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_two = static_cast(log(static_cast(2.0f))); + + // dx = dout / (x * log(2)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_two); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog10Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log10(x) = log10(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log10(x)); + } +}; + +template +struct CudaLog10GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_ten = static_cast(log(static_cast(10.0f))); + + // dx = dout / (x * log(10)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_ten); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + // Inputs: args[0], the input x + __device__ __forceinline__ T 
operator()(const T* args) const { + T x = args[0]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T dout = args[0]; + T x = args[1]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftReluFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) + // Inputs: args[0], the input x + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType t = static_cast(threshold); + MPType temp_min = x < t ? x : t; + MPType temp_max = temp_min > -t ? temp_min : -t; + return static_cast(log(one + exp(temp_max))); + } +}; + +template +struct CudaSoftReluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 + // Inputs: args[0], the input dout + // args[1], the input out + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType out = static_cast(args[1]); + MPType t = static_cast(threshold); + return (out > -t && out < t) ? 
static_cast(dout * (one - exp(-out))) + : static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // stanh(x) = b * tanh(a * x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + return static_cast(b * tanh(a * x)); + } +}; + +template +struct CudaSTanhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + MPType temp = tanh(a * x); + return static_cast(dout * a * b * (one - temp * temp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftplusFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); + } +}; + +template +struct CudaSoftplusGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return x_beta > t ? 
args[0] : static_cast(dout / (one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftsignFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // softsign(x) = x / (1 + abs(x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + abs(args[0])); + } +}; + +template +struct CudaSoftsignGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + abs(x))^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = one + abs(args[1]); + return args[0] / (temp * temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaRelu6Functor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // relu6(x) = min(max(0, x), 6) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return args[0] <= zero ? zero : (args[0] < t ? args[0] : t); + } +}; + +template +struct CudaRelu6GradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > 0 && out < t) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return (args[1] > zero && args[1] < t) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 
0 : dout + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = args[0] * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return (out > zero && out < one) ? args[0] * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSwishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // swish(x) = x / (1 + exp(-beta * x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + return static_cast(x / (one + exp(-b * x))); + } +}; + +template +struct CudaSwishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType temp1 = one / (one + exp(-b * x)); + MPType out = x * temp1; + MPType temp2 = b * out; + MPType temp3 = temp1 * (one - temp2); + return static_cast(dout * (temp2 + temp3)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > static_cast(threshold) ? 
args[0] : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > static_cast(threshold) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSwishFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // hard_swish(x) = 0, when x <= -offset + // x , when x >= threshold - offset + // x * (x + offset) / scale, otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + T temp = x + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < t ? temp_max : t; + return temp_min * x / static_cast(scale); + } +}; + +template +struct CudaHardSwishGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + T two = static_cast(2.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // dx = 0, when x <= -offset + // dout , when x >= threshold - offset + // dout * (2 * x / scale + offset / scale), otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T o = static_cast(offset); + T s = static_cast(scale); + T temp1 = static_cast(x + o > zero); + T temp2 = static_cast(x + o < static_cast(threshold)); + return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + CT x = static_cast(args[0]); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 + // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 + // dx = 0, if alpha <= 0 and x <=0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * a * exp(x) + + temp_a_neg * temp_x_pos * (one + a * exp(x)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template class ActivationCudaKernel : public framework::OpKernel { @@ -681,8 +1316,8 @@ class ActivationCudaKernel for (auto& attr : attrs) { *attr.second = ctx.Attr(attr.first); } - LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; @@ -711,17 +1346,17 @@ class ActivationGradCudaKernel if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out ins.push_back(out); - LaunchElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X ins.push_back(x); - LaunchElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } else { - LaunchElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } } }; @@ -732,23 +1367,6 @@ class ActivationGradCudaKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, ops::ActivationKernel>, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); - #define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ @@ -767,6 +1385,32 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); +#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); + /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, 
LeakyRelu, CudaLeakyReluFunctor, CudaLeakyReluGradFunctor); @@ -782,7 +1426,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_GPU_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, ops::ELUDoubleGradKernel>, ops::ActivationDoubleGradKernel>); +#else +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +#endif /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -851,29 +1525,8 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== square register ============================ */ -REGISTER_OP_CUDA_KERNEL( - square, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - square_grad, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -890,7 +1543,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== pow register ============================ */ - REGISTER_OP_CUDA_KERNEL( pow, ops::PowKernel>, ops::PowKernel>, @@ -908,7 +1560,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== exp register ============================ */ - REGISTER_OP_CUDA_KERNEL( exp, ops::ActivationCudaKernel>, @@ -943,56 +1594,44 @@ REGISTER_OP_CUDA_KERNEL( ops::LogGradGradFunctor>); /* ========================================================================== */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, CudaAtanFunctor, - CudaAtanGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, CudaAcosFunctor, - CudaAcosGradFunctor); 
-REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, CudaAsinFunctor, - CudaAsinGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, CudaSinhFunctor, - CudaSinhGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CudaCoshFunctor, - CudaCoshGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, CudaRoundFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, SoftReluFunctor, - SoftReluGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, SoftplusFunctor, - SoftplusGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, SoftsignFunctor, - SoftsignGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor, - TanhShrinkGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor, - HardShrinkGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, - HardSigmoidGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(thresholded_relu, ThresholdedRelu, - ThresholdedReluFunctor, - ThresholdedReluGradFunctor); -REGISTER_ACTIVATION_GPU_KERNEL(hard_swish, HardSwish, HardSwishFunctor, - HardSwishGradFunctor); +#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ + __macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \ + __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ + __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ + CudaLogSigmoidGradFunctor); \ + __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ + __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ + CudaSoftShrinkGradFunctor); \ + __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ + __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ + __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ + __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ + __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ + __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ + __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ + __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ + __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ + __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ + CudaReciprocalGradFunctor); \ + __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ + __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ + __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ + __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ + __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ + __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ + __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ + __macro(softsign, 
Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \ + __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \ + CudaTanhShrinkGradFunctor); \ + __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ + CudaHardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ + CudaHardSigmoidGradFunctor); \ + __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ + CudaThresholdedReluGradFunctor); \ + __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ + CudaHardSwishGradFunctor); +FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 45b28bf61e5d6..820966addfcff 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/npu_op_runner.h" @@ -145,16 +146,43 @@ class LazyZerosNPU { const std::vector found_inf_vec, const std::vector& xs, const std::vector& outs) const { + if (!xs.size()) { + return; + } + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + Tensor* zero_tensor; + void* zero_ptr; + if (found_inf_vec[0]) { + int max_num = -1; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + int num = out->numel(); + if (max_num < num) { + max_num = num; + zero_tensor = out; + } + } + + zero_tensor->mutable_data(place); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); + runner_zeros.Run(stream); + zero_tensor->check_memory_size(); + zero_ptr = zero_tensor->data(); + } + for (size_t i = 0; i < xs.size(); ++i) { auto* out = outs[i]; - if (found_inf_vec[0]) { - VLOG(4) << "-- UpdateLossScaling: Find infinite grads. 
--"; - - auto place = dev_ctx.GetPlace(); - auto stream = dev_ctx.stream(); - auto g = out->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), 0, - out->numel() * sizeof(T), stream); + auto* x = xs[i]; + auto dst_ptr = out->mutable_data(place); + if (!found_inf_vec[0]) { + framework::TensorCopy(*x, place, dev_ctx, out); + } else if (zero_ptr != dst_ptr) { + auto size = out->numel() * framework::SizeOfType(out->type()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, + stream); } } } diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 40f4b969ec060..7252ed72b2083 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -90,13 +90,11 @@ REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, ops::CastOpProtoMaker); -REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel); +REGISTER_OP_CPU_KERNEL( + cast, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 13759633d0168..1ac110b3cafd6 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -95,6 +95,7 @@ struct CastOpFunctor { namespace ops = paddle::operators; +#ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL( cast, ops::CastOpKernel, ops::CastOpKernel, @@ -105,6 +106,23 @@ REGISTER_OP_CUDA_KERNEL( ops::CastOpKernel, ops::CastOpKernel, + paddle::platform::complex>, ops::CastOpKernel); + paddle::platform::complex>); +#else +REGISTER_OP_CUDA_KERNEL( + cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); +#endif diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc index 8ccf40e317ade..05bb3830b601f 100644 --- a/paddle/fluid/operators/collective/c_identity_op.cu.cc +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -14,35 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_identity_op.h" -namespace paddle { -namespace operators { - -template -class CIdentityOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - - int rid = ctx.Attr("ring_id"); - PADDLE_ENFORCE_GE( - rid, 0, - platform::errors::InvalidArgument( - "The ring_id (%d) for c_identity op must be non-negative.", rid)); - out->mutable_data(ctx.GetPlace()); - - TensorCopy(*x, out->place(), out); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h index ca817fb6bac0e..c8577a9617489 100644 --- a/paddle/fluid/operators/collective/c_identity_op.h +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -34,5 +34,23 @@ class CIdentityOpCPUKernel : public framework::OpKernel { } }; +template +class CIdentityOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_identity_op_npu.cc b/paddle/fluid/operators/collective/c_identity_op_npu.cc new file mode 100644 index 0000000000000..a822bd11a4a83 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op_npu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 69f1f4681a33d..52a23c50c0e11 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -27,10 +27,11 @@ class CRecvOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto x = ctx.Output("Out"); - void* ptr = reinterpret_cast(const_cast(x->data())); - int numel = x->numel(); - HcclDataType dtype = platform::ToHCCLDataType(x->type()); + auto out = ctx.Output("Out"); + out->mutable_data(out->dims(), ctx.GetPlace()); + void* ptr = reinterpret_cast(const_cast(out->data())); + int numel = out->numel(); + HcclDataType dtype = platform::ToHCCLDataType(out->type()); int ring_id = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); @@ -54,8 +55,10 @@ class CRecvOpASCENDKernel : public framework::OpKernel { int root = peer; VLOG(3) << "begin hccl recv, parameter is: " - << "root " << root << ", comm: " << comm->comm() - << ", stream: " << stream; + << "ring_id:" << ring_id << ", nranks:" << nranks + << ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr + << ", dtype:" << dtype << ", root:" << root + << ", comm: " << comm->comm() << ", stream: " << stream; PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bbc42d97146f2..68a52a79e4ce3 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -233,7 +233,8 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CPU_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -242,4 +243,5 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatKernel); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc index 8c30703f2576b..8732556acb9fd 100644 --- a/paddle/fluid/operators/concat_op.cu.cc +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -23,7 +23,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatKernel, ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CUDA_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -31,4 +32,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatKernel); diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 3afe4f1e3d102..4d801bc003ea9 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -78,9 +78,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, REGISTER_OP_CPU_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu 
index 601caeb505588..d04024d70a8ea 100644 --- a/paddle/fluid/operators/conj_op.cu +++ b/paddle/fluid/operators/conj_op.cu @@ -13,15 +13,14 @@ // limitations under the License. #include "paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index bf047de86fc21..a03e4165755dd 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -131,18 +131,18 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP(less_than, "Out = X < Y"); REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index 3ca700e16e6e7..a60201f9d07d6 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -15,15 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); + paddle::operators::LessThanFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, paddle::operators::EqualFunctor); REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index 22eb2ece4b05b..7ce63aa9cbbfa 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc index e553b1076a864..dd82c74885b94 100644 --- a/paddle/fluid/operators/decode_jpeg_op.cc +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -19,7 +19,6 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu index 35975a6a54986..11616b0e0c4da 100644 --- a/paddle/fluid/operators/decode_jpeg_op.cu +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
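+// Guard the nvjpeg-based implementation: it is built only when the target is
+// neither Jetson (WITH_NV_JETSON) nor a ROCm/HIP build (PADDLE_WITH_HIP).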
-#ifndef PADDLE_WITH_HIP +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) #include #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index 26f12e8f9e3bf..31acd9718115c 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -33,7 +33,7 @@ class DotOp : public framework::OperatorWithKernel { "Output(Out) of DotOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - auto x_rank = (size_t)x_dims.size(); + auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, platform::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " @@ -154,15 +154,15 @@ REGISTER_OP_CPU_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, + paddle::platform::complex>, ops::DotKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( dot_grad, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, + paddle::platform::complex>, ops::DotGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu index 2d259ba1fbc9b..49f27e1ffb128 100644 --- a/paddle/fluid/operators/dot_op.cu +++ b/paddle/fluid/operators/dot_op.cu @@ -22,12 +22,14 @@ REGISTER_OP_CUDA_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, - ops::DotKernel, - ops::DotKernel); -REGISTER_OP_CUDA_KERNEL( - dot_grad, ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel); + ops::DotKernel>, + ops::DotKernel>); +REGISTER_OP_CUDA_KERNEL(dot_grad, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel>, + ops::DotGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 5c444e752e797..a4b97301a2611 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -39,15 +39,24 @@ struct CudaAddFunctor { }; template -struct SameDimsElemwiseAdd { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { +class ElementwiseAddKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + axis = axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis; + std::vector ins = {x, y}; std::vector outs = {z}; - LaunchElementwiseCudaKernel( - ctx.template device_context(), ins, &outs, - CudaAddFunctor()); + const auto& cuda_ctx = + ctx.template device_context(); + + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaAddFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index abea9da942355..ec7d036a1a1e0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -20,11 +20,13 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef __NVCC__ #include #include #include "cub/cub.cuh" + #endif #ifdef __HIPCC__ #include @@ -38,9 +40,10 @@ namespace paddle { namespace operators { template -void default_elementwise_add(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z) { +void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + framework::Tensor *z) { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -68,12 +71,13 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - auto dims_equal = x->dims() == y->dims(); - if (dims_equal) { - SameDimsElemwiseAdd same_dims_add; - same_dims_add(ctx, x, y, z); + if (x->dims() == y->dims()) { + SameDimsElemwiseAdd + LaunchElementwiseCpuKernel; + LaunchElementwiseCpuKernel(ctx, x, y, z); } else { - default_elementwise_add(ctx, x, y, z); + LaunchBroadcastElementwiseCpuKernel(ctx, x, + y, z); } } }; @@ -459,8 +463,8 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); ddout->mutable_data(ctx.GetPlace()); - default_elementwise_add(ctx, &ddx_safe, &ddy_safe, - ddout); + LaunchBroadcastElementwiseCpuKernel(ctx, &ddx_safe, + &ddy_safe, ddout); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 8d99aa2798568..8b902acebb4c5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -141,6 +141,7 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { } } + const T* dz_data = dz->data(); T* dx_data = nullptr; T* dy_data = nullptr; if (dx) { @@ -152,9 +153,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { auto& dev_ctx = ctx.template device_context(); - int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dx_data, dx_data, - dx_data, dz->data(), dy_data, - dx_data, x_dims_vec, y_dims_vec); + int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dz_data, dz_data, + dz_data, dz_data, dy_data, dx_data, + x_dims_vec, y_dims_vec); PADDLE_ENFORCE_EQ( ret, xpu::SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h new file mode 100644 index 0000000000000..1492fc629457c --- /dev/null +++ 
b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -0,0 +1,525 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.1 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" + +namespace paddle { +namespace operators { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)(bool &, std::vector &, DimVector &, + int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = + std::accumulate(vec->begin() + l_idx, vec->begin() + m_idx, 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform( + const std::vector &ins, + const framework::DDim &dims, int axis) { + const int N = ins.size(); + dim_size = dims.size(); + out_dims = framework::vectorize(dims); + in_dims.resize(N); + for (int j = 0; j < N; ++j) { + in_dims[j] = framework::vectorize(ins[j]->dims()); + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + 
std::vector &in_dims, + DimVector &out, int i, int num) { + for (int j = 1; j < num; ++j) { + equal = (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, int i, int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal = in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. + MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1, + std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate(in_dims[j].begin(), in_dims[j].end(), 1, + std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. + merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +struct StridesCalculation { + std::vector> strides; + std::vector divmoders; + + private: + // To calculate the strides of each input_tensor. + __inline__ void CalculateStrides( + int N, int dim_size, const std::vector> &in_dims) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < dim_size; ++i) { + strides[j][i] = in_dims[j][i] == 1 ? 0 : strides[j][i]; + strides[j][i] = + (i != 0 && strides[j][i] != 0) + ? std::accumulate(in_dims[j].begin(), in_dims[j].begin() + i, 1, + std::multiplies()) + : strides[j][i]; + } + } + } + + public: + explicit StridesCalculation(const int64_t &dim_size, + const std::vector> &in_dims, + const std::vector &out_dims) { + const auto N = in_dims.size(); + divmoders.resize(dim_size); + strides.resize(N, std::vector(dim_size, 1)); + + for (int i = 0; i < dim_size; ++i) { + divmoders[i] = FastDivMod(out_dims[i]); + } + CalculateStrides(N, dim_size, in_dims); + } +}; + +template +struct BroadcastArgsWarpper { + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; + + OutT *out_data; + OutVecType *vec_out_data; + const InT *__restrict__ in_data[ET]; + const InVecType *__restrict__ vec_in_data[ET]; + bool no_broadcast[ET]; + FastDivMod divmoders[kDims]; + uint32_t strides[ET][framework::DDim::kMaxRank]; + uint32_t scalar_cal_offset; + Functor func; + + HOSTDEVICE BroadcastArgsWarpper( + const std::vector &ins, framework::Tensor *out, + int scalar_cal_offset, Functor func, + const StridesCalculation &offset_calculator) + : scalar_cal_offset(scalar_cal_offset), func(func) { + for (int j = 0; j < ET; ++j) { + in_data[j] = ins[j]->data(); + vec_in_data[j] = reinterpret_cast(in_data[j]); + no_broadcast[j] = ins[j]->dims() == out->dims() ? 
true : false; + memcpy(strides[j], offset_calculator.strides[j].data(), + kDims * sizeof(uint32_t)); + } + out_data = out->data(); + vec_out_data = reinterpret_cast(out_data); + memcpy(divmoders, offset_calculator.divmoders.data(), + kDims * sizeof(FastDivMod)); + } + + __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { + uint32_t offset = 0; + +#pragma unroll(kDims) + for (int i = 0; i < kDims; ++i) { + auto fast_divmoder = divmoders[i].Divmod(idx); + idx = fast_divmoder.val[0]; + offset += fast_divmoder.val[1] * strides[in_idx][i]; + } + return offset; + } + + __device__ __forceinline__ void LoadVectorizedDataCommon( + InVecType *vector_args, int tid, int idx) { + *vector_args = vec_in_data[idx][tid]; + } + + __device__ __forceinline__ void LoadVectorizedDataByDivmod(InT *scalar_args, + int tid, int idx) { + int index = tid * VecSize; +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { + uint32_t offset = GetOffsetByDivmod(index + i, idx); + scalar_args[i] = in_data[idx][offset]; + } + } + + __device__ __forceinline__ void LoadScalarizedDataCommon(InT args[], int tid, + int idx) { + args[idx] = in_data[idx][tid + scalar_cal_offset]; + } + + __device__ __forceinline__ void LoadScalarizedDataByDivmod(InT args[], + int tid, int idx) { + auto offset = GetOffsetByDivmod(tid + scalar_cal_offset, idx); + args[idx] = in_data[idx][offset]; + } + + __device__ __forceinline__ void LoadVectorizedData(InT (*args)[VecSize], + int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + InVecType *vector_args = reinterpret_cast(args[j]); + LoadVectorizedDataCommon(vector_args, tid, j); + } else { + LoadVectorizedDataByDivmod(args[j], tid, j); + } + } + } + + __device__ __forceinline__ void LoadScalarizedData(InT args[], int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + LoadScalarizedDataCommon(args, tid, j); + } else { + LoadScalarizedDataByDivmod(args, tid, j); + } + } + } + + __device__ __forceinline__ void StoreVectorizedData(OutVecType vec_args_out, + int tid) { + vec_out_data[tid] = vec_args_out; + } + + __device__ __forceinline__ void StoreScalarizedData(OutT args_out, int tid) { + out_data[scalar_cal_offset + tid] = args_out; + } +}; + +template +__device__ inline void ScalarizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + InT args[ET]; + OutT args_out; + broadcast_warpper.LoadScalarizedData(args, tid); + +#pragma unroll(ET) + for (int j = 1; j < ET; ++j) { + args_out = broadcast_warpper.func(args); + } + broadcast_warpper.StoreScalarizedData(args_out, tid); +} + +template +__device__ inline void VectorizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + using OutVecType = CudaAlignedVector; + OutVecType args_out; + InT ins[ET]; + InT args[ET][VecSize]; + broadcast_warpper.LoadVectorizedData(args, tid); + +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + ins[j] = args[j][i]; + } + args_out.val[i] = broadcast_warpper.func(ins); + } + broadcast_warpper.StoreVectorizedData(args_out, tid); +} + +template +__global__ void ElementwiseBroadcastKernel( + BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // Vectorized calculation of major data whose length is the max multipler of + // VecSize, + // eg: Calcualting the front 1024-length data in total 1027 data once VecSize + // is 4. 
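+  // Concretely: main_tid = numel / VecSize = 1027 / 4 = 256 threads take the
+  // vectorized path below and cover elements [0, 1024), while tail_tid =
+  // numel % VecSize = 3 threads take the scalar path for the remaining
+  // elements [1024, 1027), starting at scalar_cal_offset = 256 * 4.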
+ if (tid < main_tid) { + VectorizedBroadcastKernelImpl( + broadcast_warpper, tid); + } + // Scalarized calculation of the remaining data whose length cannot fill a + // full VecSize, + // e.g. calculating the last 3 of 1027 total elements when VecSize is + // 4. + if (tid < tail_tid) { + ScalarizedBroadcastKernelImpl( + broadcast_warpper, tid); + } +} + +template +void LaunchBroadcastKernelForDifferentDimSize( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, framework::Tensor *out, + int axis, Functor func) { + int numel = out->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + int main_tid = numel / VecSize; + int tail_tid = numel % VecSize; + int vec_len = main_tid * VecSize; + auto stream = ctx.stream(); + + const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); + const auto offset_calculator = StridesCalculation( + merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims); + + switch (merge_dims.dim_size) { + case 1: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 2: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 3: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 4: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 5: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 6: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 7: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 8: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but received %d.\n", + framework::DDim::kMaxRank, merge_dims.dim_size)); + } + } +} + +template +void LaunchBroadcastElementwiseCudaKernel( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + static_assert(ET == (ElementwiseType)2, "Only support binary calculation."); + int in_vec_size = 4; + framework::Tensor *out = (*outs)[0]; + for (auto *in : ins) { + auto temp_size = GetVectorizedSizeImpl(in->data()); + in_vec_size = in->dims() == out->dims() ?
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int out_vec_size = GetVectorizedSizeImpl(out->data()); + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 2: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 1: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + default: { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +template +void LaunchElementwiseCudaKernel( + const platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag = ins[0]->dims() == in->dims(); + } + + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, outs, func); + } else { + LaunchBroadcastElementwiseCudaKernel(cuda_ctx, ins, outs, axis, + func); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 321826ec647c9..33a2b7e182f0a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/fast_divmod.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -29,11 +28,6 @@ namespace operators { enum ElementwiseType { kUnary = 1, kBinary = 2 }; -template -struct alignas(sizeof(T) * Size) CudaAlignedVector { - T val[Size]; -}; - template int GetVectorizedSizeImpl(const T *pointer) { uint64_t address = reinterpret_cast(pointer); @@ -49,69 +43,73 @@ int GetVectorizedSizeImpl(const T *pointer) { return 1; } -template +template int GetVectorizedSize(const std::vector &ins, const std::vector &outs) { int vec_size = 4; for (auto iter = ins.begin(); iter != ins.end(); ++iter) { vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); } for (auto iter = outs.begin(); iter != outs.end(); ++iter) { vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); } return vec_size; } -template +template struct ElementwiseDataWrapper { - T *out; - const T *in0; - const T *in1; - __device__ ElementwiseDataWrapper(T *out, const T *in0, - const T *in1 = nullptr) + OutT *out; + const InT *in0; + const InT *in1; + __device__ ElementwiseDataWrapper(OutT *out, const InT *in0, + const InT *in1 = nullptr) : out(out), in0(in0), in1(in1) {} - using VecType = CudaAlignedVector; + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; - inline __device__ void load_vector(VecType args[], int idx) { - const VecType *x_vec = reinterpret_cast(in0); + inline __device__ void load_vector(InVecType args[], int idx) { + const InVecType *x_vec = reinterpret_cast(in0); args[0] = x_vec[idx]; if (ET == ElementwiseType::kBinary) { - const VecType *y_vec = reinterpret_cast(in1); + const InVecType *y_vec = reinterpret_cast(in1); args[1] = y_vec[idx]; } } - inline 
__device__ void load_scalar(T args[], int idx) { + inline __device__ void load_scalar(InT args[], int idx) { args[0] = in0[idx]; if (ET == ElementwiseType::kBinary) { args[1] = in1[idx]; } } - inline __device__ void store_vector(VecType res, int idx) { - VecType *out_vec = reinterpret_cast(out); + inline __device__ void store_vector(OutVecType res, int idx) { + OutVecType *out_vec = reinterpret_cast(out); out_vec[idx] = res; } - inline __device__ void store_scalar(T res, int idx) { out[idx] = res; } + inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; } }; -template +template __device__ void VectorizedKernelImpl( - ElementwiseDataWrapper data, Functor func, int tid) { - using VecType = CudaAlignedVector; - VecType ins_vec[ET]; - VecType out_vec; - T *ins_ptr[ET]; - T *out_ptr; + ElementwiseDataWrapper data, Functor func, + int tid) { + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; + InVecType ins_vec[ET]; + OutVecType out_vec; + InT *ins_ptr[ET]; + OutT *out_ptr; #pragma unroll for (int i = 0; i < ET; ++i) { - ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); + ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); } - out_ptr = reinterpret_cast(&out_vec); + out_ptr = reinterpret_cast(&out_vec); // load data.load_vector(ins_vec, tid); @@ -119,7 +117,7 @@ __device__ void VectorizedKernelImpl( // compute #pragma unroll for (int i = 0; i < VecSize; ++i) { - T ins[ET]; + InT ins[ET]; #pragma unroll for (int j = 0; j < ET; ++j) { ins[j] = ins_ptr[j][i]; @@ -131,11 +129,13 @@ __device__ void VectorizedKernelImpl( data.store_vector(out_vec, tid); } -template -__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, - Functor func, int start, int remain) { - T ins[ET]; - T out; +template +__device__ void ScalarKernelImpl( + ElementwiseDataWrapper data, Functor func, + int start, int remain) { + InT ins[ET]; + OutT out; for (int i = 0; i < remain; ++i) { int idx = start + i; @@ -148,14 +148,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper data, } } -template -__global__ void VectorizedKernel(const T *__restrict__ in0, - const T *__restrict__ in1, T *out, int size, - Functor func) { +template +__global__ void VectorizedKernel(const InT *__restrict__ in0, + const InT *__restrict__ in1, OutT *out, + int size, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = size - VecSize * tid; remain = remain > 0 ? remain : 0; - auto data = ElementwiseDataWrapper(out, in0, in1); + auto data = ElementwiseDataWrapper(out, in0, in1); if (remain >= VecSize) { VectorizedKernelImpl(data, func, tid); } else { @@ -163,32 +164,34 @@ __global__ void VectorizedKernel(const T *__restrict__ in0, } } -template -__global__ void ScalarKernel(const T *__restrict__ in0, - const T *__restrict__ in1, T *out, int size, +template +__global__ void ScalarKernel(const InT *__restrict__ in0, + const InT *__restrict__ in1, OutT *out, int size, Functor func) { - auto data = ElementwiseDataWrapper(out, in0, in1); + auto data = ElementwiseDataWrapper(out, in0, in1); int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = tid < size ? 
1 : 0; ScalarKernelImpl(data, func, tid, remain); } -template -void LaunchElementwiseCudaKernel( +template +void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, Functor func) { // calculate the max vec_size for all ins and outs auto size = ins[0]->numel(); - int vec_size = GetVectorizedSize(ins, *outs); + int vec_size = GetVectorizedSize(ins, *outs); int block_size = ELEMENTWISE_BLOCK_SIZE; int grid_size = ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; - const T *in0 = ins[0]->data(); - const T *in1 = (ET == ElementwiseType::kBinary) ? ins[1]->data() : nullptr; - T *out = (*outs)[0]->data(); + const InT *in0 = ins[0]->data(); + const InT *in1 = + (ET == ElementwiseType::kBinary) ? ins[1]->data() : nullptr; + OutT *out = (*outs)[0]->data(); // cuda kernel auto stream = ctx.stream(); + switch (vec_size) { case 4: VectorizedKernel<<>>( diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 97cd4d90be689..e9ad2895e03db 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -173,7 +173,9 @@ void FusedBatchNormActOpMaker::Make() { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, platform::errors::InvalidArgument( - "'epsilon' should be between 0.0 and 0.001.")); + "Attr(epsilon) should be between 0.0 and 0.001, " + "but received value is %f.", + epsilon)); }); AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index b53b407d4995d..4d270280d389c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -25,11 +25,13 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE_EQ(context->Inputs("Ids").size(), - context->Inputs("Embs").size(), - platform::errors::InvalidArgument( - "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " - "the same size")); + PADDLE_ENFORCE_EQ( + context->Inputs("Ids").size(), context->Inputs("Embs").size(), + platform::errors::InvalidArgument( + "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " + "the same size, but received the size of input Ids = %d," + " the size of input Embs = %d", + context->Inputs("Ids").size(), context->Inputs("Embs").size())); PADDLE_ENFORCE_GE(context->Inputs("Embs").size(), 2UL, platform::errors::InvalidArgument( "Input Embs of EmbeddingEltWiseLayerNormOp should " @@ -77,7 +79,8 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( embs_dims[i][1], hidden, platform::errors::InvalidArgument( - "The Emb first dim size(%d) shoule equal to hidden (%d).", + "The second dimension size(%d) of the Embedding should be " + "equal to the hidden's size(%d)", embs_dims[i][1], hidden)); } diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index bd376b1e7aaef..382d01f6a535c 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ 
b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -40,7 +40,9 @@ class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { const size_t n = ins.size(); PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensors dim size should greater than 0.")); + "The size of Inputs(X)'s dimension should be greater " + " than 0, but received %d.", + n)); std::vector trans_axis = ctx->Attrs().Get>("trans_axis"); diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index ae3d0f2633bb1..6d1dac8304050 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -40,16 +40,6 @@ class GatherOpXPUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); const auto index_dims = index->dims(); if (index_dims.size() == 2) { @@ -65,14 +55,26 @@ class GatherOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } - int slice_size = x->numel() / x->dims()[0]; + std::vector xshape(x->dims().size()); + for (int i = 0; i < x->dims().size(); ++i) { + xshape[i] = x->dims()[i]; + } + auto &dev_ctx = ctx.template device_context(); - int r = - xpu::gather(dev_ctx.x_context(), x->data(), index->data(), - index->dims()[0], slice_size, output->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), xshape, + index->dims()[0], 0); + } else { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), + xshape, index->dims()[0], 0); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; @@ -93,30 +95,11 @@ class GatherGradOpXPUKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Now, it doesn't support XPU with Axis.")); } - - dx->mutable_data(ctx.GetPlace()); - const int zero = 0; - int r_dx = xpu::memset(dev_ctx.x_context(), dx->data(), zero, - dx->numel() * sizeof(T)); - PADDLE_ENFORCE_EQ( - r_dx, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! 
error code=%d", r_dx)); - if (dout->numel() == 0) { return; } - bool overwrite = ctx.Attr("overwrite"); - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); + bool overwrite = ctx.Attr("overwrite"); const auto index_dims = index->dims(); if (index_dims.size() == 2) { PADDLE_ENFORCE_EQ( @@ -131,16 +114,27 @@ class GatherGradOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } + std::vector xshape(dx->dims().size()); + for (int i = 0; i < dx->dims().size(); ++i) { + xshape[i] = dx->dims()[i]; + } - int index_size = index_dims[0]; - int slice_size = dout->numel() / dout->dims()[0]; + dx->mutable_data(ctx.GetPlace()); - int r = xpu::scatter(dev_ctx.x_context(), dout->data(), - index->data(), index_size, slice_size, - dx->data(), overwrite); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), xshape, + index->dims()[0], 0, overwrite); + } else { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), + xshape, index->dims()[0], 0, overwrite); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather grad kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 899025ae7093b..6a195bb9400e8 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -96,11 +96,11 @@ REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu index a7a3b13682198..9cfb2ef7f2fef 100644 --- a/paddle/fluid/operators/imag_op.cu +++ b/paddle/fluid/operators/imag_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 9574b325ef77f..87618b954d232 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -29,6 +29,11 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + // It seems cann 20.1 accepts int64, but cann 20.2+ not. 
+ PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32, + platform::errors::Unimplemented( + "The index of LookupTableV2 should be int32.")); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h index 0e8aed40f6e16..f530256677854 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/fluid/operators/math/complex_functors.h @@ -16,8 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -66,7 +65,10 @@ using select_t = typename select::type; template using Real = select_t::value, float>, - cond::value, double>, T>; + cond::value, double>, + cond>::value, float>, + cond>::value, double>, + T>; template using Complex = typename std::enable_if::value>::type; @@ -76,14 +78,18 @@ template using NoComplex = typename std::enable_if::value>::type; template -using EnableComplex = - typename std::enable_if::value || - std::is_same::value>::type; +using EnableComplex = typename std::enable_if< + std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value>::type; template using DisableComplex = typename std::enable_if< !std::is_same::value && - !std::is_same::value>::type; + !std::is_same::value && + !std::is_same>::value && + !std::is_same>::value>::type; template struct RealFunctor; @@ -173,44 +179,45 @@ struct AbsGradFunctor { }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const float* dout, const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const float* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(dout_[idx]) * - (x_[idx] / paddle::platform::complex64(abs(x_[idx]))); + output_[idx] = paddle::platform::complex(dout_[idx]) * + (x_[idx] / paddle::platform::complex(abs(x_[idx]))); } } const float* dout_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const double* dout, const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const double* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(dout_[idx]) * - (x_[idx] / paddle::platform::complex128(abs(x_[idx]))); + output_[idx] = + paddle::platform::complex(dout_[idx]) * + (x_[idx] / 
paddle::platform::complex(abs(x_[idx]))); } } const double* dout_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; @@ -234,46 +241,46 @@ struct AbsGradGradFunctor { }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex128* ddx, - const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(ddx_[idx]) * x_[idx] / - paddle::platform::complex128(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex128* ddx_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex64* ddx, - const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(ddx_[idx]) * x_[idx] / - paddle::platform::complex64(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex64* ddx_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index d6ad3aec22b1f..a79a9da0b30f2 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,16 +65,18 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16); \ - macro(::paddle::platform::complex64); \ +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + 
macro(::paddle::platform::bfloat16); \ + macro(::paddle::platform::complex); \ + macro(::paddle::platform::complex); \ + macro(::paddle::platform::complex64); \ macro(::paddle::platform::complex128) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 0bdc7b6943422..d01a39ecb7c93 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -47,6 +47,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #ifdef PADDLE_WITH_XPU template struct SetConstant; @@ -59,6 +63,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #endif #define DEFINE_CPU_TRANS(RANK) \ @@ -74,6 +82,10 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; \ template struct Transpose; \ template struct Transpose); +DEFINE_CPU_TRANS_NORMAL(platform::complex); struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) @@ -158,6 +172,14 @@ void set_constant_with_place( PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f94c1bf696cda..c5c78c87f7977 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -43,6 +43,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #define DEFINE_GPU_TRANS(RANK) \ template struct Transpose; \ @@ -52,6 +56,10 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; \ template struct Transpose; \ template struct Transpose; @@ -145,6 +153,8 @@ DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); DEFINE_GPU_TRANS_NORMAL(complex64); DEFINE_GPU_TRANS_NORMAL(complex128); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index c12aecc9ba516..e226ab5328844 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -825,6 +825,21 @@ class MatMulOpGrad : public framework::OperatorWithKernel { context->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) 
{ + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; template diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 4ffcbaf55314a..c18b8590db18d 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -23,91 +23,112 @@ template class AccuracyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Out"); + auto* inference = ctx.Input("Out"); auto* label = ctx.Input("Label"); - // auto* logits = ctx.Input("Indices"); + auto* indices = ctx.Input("Indices"); - auto* acc = ctx.Output("Accuracy"); + auto* accuracy = ctx.Output("Accuracy"); auto* correct = ctx.Output("Correct"); auto* total = ctx.Output("Total"); auto stream = ctx.template device_context() .stream(); - // cast pred - Tensor tmp_pred(pred->type()); - tmp_pred.Resize(pred->dims()); - tmp_pred.mutable_data(ctx.GetPlace()); - auto runner_cast_pred = - NpuOpRunner("Cast", {*pred}, {tmp_pred}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_pred.Run(stream); - - // cast label - Tensor tmp_label(label->type()); - tmp_label.Resize(label->dims()); - tmp_label.mutable_data(ctx.GetPlace()); - auto runner_cast_label = - NpuOpRunner("Cast", {*label}, {tmp_label}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_label.Run(stream); + int num_samples = inference->dims()[0]; + if (num_samples == 0) { + return; + } + + // cast `indices` or `label` if their type is not consistent + Tensor cast_indices(framework::proto::VarType::INT32); + Tensor cast_label(framework::proto::VarType::INT32); + if (indices->type() != label->type()) { + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); + if (indices->type() != framework::proto::VarType::INT32) { + cast_indices.Resize(indices->dims()); + cast_indices.mutable_data(ctx.GetPlace()); + auto runner_cast_indices = + NpuOpRunner("Cast", {*indices}, {cast_indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); + } else { + cast_indices.ShareDataWith(*indices); + } + if (label->type() != framework::proto::VarType::INT32) { + cast_label.Resize(label->dims()); + cast_label.mutable_data(ctx.GetPlace()); + auto runner_cast_label = + NpuOpRunner("Cast", {*label}, {cast_label}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_label.Run(stream); + } else { + cast_label.ShareDataWith(*label); + } + } else { + cast_indices.ShareDataWith(*indices); + cast_label.ShareDataWith(*label); + } // equal - Tensor tmp_equal(label->type()); - tmp_equal.Resize(label->dims()); + Tensor tmp_equal(framework::proto::VarType::BOOL); + tmp_equal.Resize(inference->dims()); tmp_equal.mutable_data(ctx.GetPlace()); auto runner_equal = - NpuOpRunner("Equal", {tmp_pred, tmp_label}, {tmp_equal}, {}); + NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); runner_equal.Run(stream); // cast equal - Tensor tmp_equal_cast(label->type()); - tmp_equal_cast.Resize(label->dims()); + Tensor tmp_equal_cast(framework::proto::VarType::FP32); + tmp_equal_cast.Resize(inference->dims()); tmp_equal_cast.mutable_data(ctx.GetPlace()); - auto runner_cast_equal = - NpuOpRunner("Cast", {tmp_equal}, {tmp_equal_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); + auto runner_cast_equal = 
NpuOpRunner( + "Cast", {tmp_equal}, {tmp_equal_cast}, + {{"dst_type", + static_cast(ConvertToNpuDtype(tmp_equal_cast.type()))}}); runner_cast_equal.Run(stream); - // acc - acc->mutable_data(ctx.GetPlace()); - std::vector axes_vec_1; - auto runner_acc = NpuOpRunner("ReduceMeanD", {tmp_equal_cast}, {*acc}, - {{"keep_dims", false}, {"axes", axes_vec_1}}); - runner_acc.Run(stream); - - // correct - correct->mutable_data(ctx.GetPlace()); - std::vector axes_vec_2; - auto runner_correct = - NpuOpRunner("ReduceSumD", {tmp_equal_cast}, {*correct}, - {{"keep_dims", false}, {"axes", axes_vec_2}}); - runner_correct.Run(stream); - - // ones_tensor - Tensor ones_tensor(label->type()); - ones_tensor.Resize(label->dims()); - ones_tensor.mutable_data(ctx.GetPlace()); - auto runner_oneslike = - NpuOpRunner("OnesLike", {tmp_label}, {ones_tensor}, {}); - runner_oneslike.Run(stream); - - // ones_tensor_cast - Tensor ones_tensor_cast(label->type()); - ones_tensor_cast.Resize(label->dims()); - ones_tensor_cast.mutable_data(ctx.GetPlace()); - auto runner_ones_cast = - NpuOpRunner("Cast", {ones_tensor}, {ones_tensor_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); - runner_ones_cast.Run(stream); - - // total - total->mutable_data(ctx.GetPlace()); - std::vector axes_vec_3; - auto runner_total = - NpuOpRunner("ReduceSumD", {ones_tensor_cast}, {*total}, - {{"keep_dims", false}, {"axes", axes_vec_3}}); - runner_total.Run(stream); + // [correct] + // reduce_max + Tensor tmp_correct_max(framework::proto::VarType::FP32); + tmp_correct_max.Resize(framework::make_ddim({num_samples})); + tmp_correct_max.mutable_data(ctx.GetPlace()); + auto runner_reduce_max = + NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max}, + {{"axes", std::vector{1}}, {"keep_dims", false}}); + runner_reduce_max.Run(stream); + + // reduce_sum + Tensor tmp_correct(framework::proto::VarType::FP32); + tmp_correct.Resize(correct->dims()); + tmp_correct.mutable_data(ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct}, + {{"axes", std::vector{0}}, {"keep_dims", false}}); + runner_reduce_sum.Run(stream); + + // cast to int + correct->mutable_data(ctx.GetPlace()); + auto runner_cast_correct = NpuOpRunner( + "Cast", {tmp_correct}, {*correct}, + {{"dst_type", static_cast(ConvertToNpuDtype(correct->type()))}}); + runner_cast_correct.Run(stream); + + // [total] + total->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(total, static_cast(num_samples)); + + // use `total` of type `float32` for calculating accuracy + Tensor tmp_total(framework::proto::VarType::FP32); + tmp_total.Resize(total->dims()); + tmp_total.mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(&tmp_total, + static_cast(num_samples)); + + // [accuracy] + accuracy->mutable_data(ctx.GetPlace()); + auto runner_accuracy = + NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); + runner_accuracy.Run(stream); } }; diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index e2e9d280027b6..b6b0b486bf060 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -14,21 +14,104 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +template +class LRNMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCachedNonBlocking()) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + } + } + + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* in_x, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto dims = framework::vectorize(in_x->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + in_x->format()); + auto diff_md = mkldnn::memory::desc( + dims, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, + beta, k); + } + } + + std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { + T* ptr = workspace->mutable_data( + this->place_, this->fwd_pd_->workspace_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + ptr, "@wrk_mem_p"); + } + + std::shared_ptr AcquireBackwardWorkspaceMemory( + const Tensor* workspace) { + const T* workspace_data = workspace->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_desc(), + platform::to_void_cast(workspace_data), 
"@bwd-wrk_mem_p"); + } +}; + template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -48,8 +131,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - platform::LRNMKLDNNHandler handler( - ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, ctx.OutputName("Out")); + LRNMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, + ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -87,34 +170,22 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( - "is_test attribute should be set to False in training phase.")); - auto x = ctx.Input("X"); + auto in_x = ctx.Input("X"); auto mid = ctx.Input("MidOut"); auto out_grad = ctx.Input(framework::GradVarName("Out")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); + auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); - auto dims = paddle::framework::vectorize(x->dims()); + LRNMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), in_x, out_grad, + in_x_grad, ctx.InputName("Out")); - platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), - out_grad->format(), dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); - - auto src_memory = handler.AcquireSrcMemory(x); + auto src_memory = handler.AcquireSrcMemory(in_x); auto workspace = handler.AcquireBackwardWorkspaceMemory(mid); auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); - auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); + auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); auto lrn_bwd = handler.AcquireBackwardPrimitive(); @@ -125,8 +196,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_WORKSPACE, *workspace}}); astream.wait(); - x_grad->set_layout(framework::DataLayout::kMKLDNN); - x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 3ef9d88e4e91e..2b3496359b0c6 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace platform { @@ -37,6 +37,111 @@ using platform::MKLDNNGetDataType; using platform::to_void_cast; using Tensor = framework::Tensor; +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. 
+static framework::Tensor FoldOuterDims(const Tensor& input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static framework::Tensor FoldFirstAndLastDims( + const MKLDNNDeviceContext& dev_ctx, const Tensor* input) { + auto input_dims = framework::vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + framework::Tensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = framework::vectorize(output.dims()); + + memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); + std::string key = platform::CreateKey(dev_ctx, input_dims, input->format(), + input->format(), input_type); + platform::ReorderMKLDNNHandler reorder_handler(output_dims, input->type(), + input_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, platform::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +template +class MatMulMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + MatMulMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + Tensor* x, bool trans_x, Tensor* y, bool trans_y, + Tensor* out, float scale, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + auto mat_dim_x = math::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; + + memory::dims y_strides = + !trans_y ? 
memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@weights_mem_p"); + } +}; + template constexpr bool IsInt8() { return std::is_same::value || std::is_same::value; @@ -44,7 +149,7 @@ constexpr bool IsInt8() { template constexpr bool IsBfloat16() { - return std::is_same::value; + return std::is_same::value; } // Get row matrix shape from a vector shape. If the rank of x_dim > 1, the @@ -60,6 +165,60 @@ static framework::DDim ColumnMatrixDimsFromVector( return y_dim.size() > 1 ? y_dim : framework::make_ddim({y_dim[0], 1}); } +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + framework::Tensor* x, const math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. 
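 * For example, with trans_x = trans_y = false, X = [2, 3, 4] and Y = [4, 5]
 * give mat_dim_x = {batch 2, 3 x 4} and mat_dim_y = {batch 0, 4 x 5}, so Out
 * is resized to [2, 3, 5] while X and Y keep their original shapes.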
+ */ +static void ReshapeXYOutToMatrixSequence(framework::Tensor* x, + framework::Tensor* y, + framework::Tensor* out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixDimsFromVector(x->dims()); + auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + template class MatMulFactory { public: @@ -372,7 +531,7 @@ static void ExecuteMatMul(const ExecutionContext& ctx) { template class DNNLMatMulKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const ExecutionContext& ctx) const override { if (ctx.HasAttr("head_number")) { PADDLE_ENFORCE_EQ( ctx.Attr("head_number"), 1, @@ -385,6 +544,137 @@ class DNNLMatMulKernel : public framework::OpKernel { ExecuteMatMul(ctx); } }; + +template +class MatMulGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), 1, + platform::errors::Unimplemented( + "DNNL matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + RunKernel(ctx); + } + + private: + void ExecuteMatMulGrad(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine& engine, Tensor* x, bool trans_x, + bool is_fold_init_dims_x, Tensor* y, bool trans_y, + bool is_fold_init_dims_y, Tensor* out, + int execution_number) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + Tensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? 
FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } + + MatMulMKLDNNHandler handler( + dev_ctx, engine, ctx.GetPlace(), &x_combined, trans_x, &y_combined, + trans_y, out, ctx.Attr("alpha"), + ctx.InputName(framework::GradVarName("Out")) + + std::to_string(execution_number)); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( + framework::vectorize(out->dims())))); + } + + template + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, true, true, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, true, false, dy, 1); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, false, false, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, false, false, + &dout, false, true, dy, 1); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, false, true, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, false, true, dy, 1); + } else { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, true, true, + &dout, false, true, dy, 1); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; @@ -394,3 +684,7 @@ REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel); + +REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulGradMKLDNNKernel, + ops::MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index b7bed95b1d335..04e0bcbfc7ce3 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -43,7 +43,7 @@ class 
PoolingMKLDNNHandler platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), framework::ToMKLDNNDataType(input->type()), unique_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument( "Wrong layout set for Input tensor.")); @@ -100,11 +100,10 @@ class PoolingMKLDNNHandler const auto is_test = ctx.Attr("is_test"); const auto dt = framework::ToMKLDNNDataType(input->type()); - const auto fmt = input->format(); const auto exclude_padding = ctx.Attr("exclusive"); - const auto src_md = mkldnn::memory::desc(src_tz, dt, fmt); + const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format()); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance @@ -124,7 +123,7 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides); - this->AcquireForwardPrimitiveDescriptor( + this->AcquireForwardPrimitiveDescriptorNonBlocking( is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, pooling_type == "max" @@ -200,6 +199,10 @@ class PoolingMKLDNNHandler auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); + const auto dt = framework::ToMKLDNNDataType(in_x->type()); + auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format()); + auto dst_md = + mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any); auto diff_dst_md = mkldnn::memory::desc( diff_dst_tz, platform::MKLDNNGetDataType(), out_grad->format()); auto diff_src_md = @@ -216,7 +219,18 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides); const auto exclude_padding = ctx.Attr("exclusive"); - this->AcquireBackwardPrimitiveDescriptor( + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + mkldnn::prop_kind::forward_training, + pooling_type == "max" + ? mkldnn::algorithm::pooling_max + : (exclude_padding + ? mkldnn::algorithm::pooling_avg_exclude_padding + : mkldnn::algorithm::pooling_avg_include_padding), + src_md, dst_md, strides, ksize, mkldnn_paddings[0], + mkldnn_paddings[1]); + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( pooling_type == "max" ? mkldnn::algorithm::pooling_max : (exclude_padding diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc new file mode 100644 index 0000000000000..e91bbd15cfb7c --- /dev/null +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class ScaleMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + + float scale = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + float bias = ctx.Attr("bias"); + + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + + if (!bias_after_scale) bias *= scale; + + auto x_tz = framework::vectorize(x->dims()); + bool is_inplaced = x->IsSharedBufferWith(*out); + + platform::ActivationMKLDNNHandler handler( + x_tz, mkldnn::algorithm::eltwise_linear, scale, bias, x->format(), + dev_ctx, ctx.GetPlace(), ctx.InputName("X"), is_inplaced); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, + {MKLDNN_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace, + ops::ScaleMKLDNNKernel, + ops::ScaleMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 1138d51139293..1d177e120b59f 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -15,15 +15,6 @@ limitations under the License. 
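/* Editorial sketch (not part of the patch): why the single `bias *= scale` fold in the
   scale kernel above lets one oneDNN eltwise_linear primitive (out = alpha*x + beta,
   with alpha = scale, beta = bias) cover both bias_after_scale modes. Plain host-side
   C++, illustrative names only. */
#include <cassert>
#include <cmath>

static float scale_reference(float x, float scale, float bias,
                             bool bias_after_scale) {
  return bias_after_scale ? scale * x + bias : scale * (x + bias);
}

static float scale_as_eltwise_linear(float x, float scale, float bias,
                                     bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;  // same folding as in the kernel above
  return scale * x + bias;               // eltwise_linear with alpha=scale, beta=bias
}

int main() {
  for (float x : {-2.0f, 0.0f, 3.5f}) {
    assert(std::fabs(scale_reference(x, 1.5f, 0.25f, true) -
                     scale_as_eltwise_linear(x, 1.5f, 0.25f, true)) < 1e-6f);
    assert(std::fabs(scale_reference(x, 1.5f, 0.25f, false) -
                     scale_as_eltwise_linear(x, 1.5f, 0.25f, false)) < 1e-6f);
  }
  return 0;
}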
*/ #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { @@ -59,7 +50,7 @@ class SoftmaxMKLDNNHandler : platform::CreateKey( dev_ctx, framework::vectorize(input->dims()), uniq_name)) { - if (!this->isCached()) { + if (!this->isCachedNonBlocking()) { PADDLE_ENFORCE_EQ( input->dims(), output->dims(), platform::errors::InvalidArgument( @@ -69,27 +60,41 @@ class SoftmaxMKLDNNHandler auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), input->format()); - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, md, axis); } } - SoftmaxMKLDNNHandler(const std::vector& dims, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, const int& axis, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& uniq_name) + SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* out, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, uniq_name)) { - auto data_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument("The shape of softmax_grad's input " + "and output must be identical.")); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + prop_kind::forward_scoring, data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + diff_softmax_md, data_softmax_md, axis); + } } }; @@ -145,27 +150,15 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const Tensor* output = ctx.Input("Out"); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - auto* dx = - ctx.template Output(framework::GradVarName("X")); - - PADDLE_ENFORCE_EQ( - dout->dims(), dx->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input and output must be identical.")); - - auto dims = dout->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - - auto softmax_tz = paddle::framework::vectorize(dims); + auto* out_grad = ctx.template Input(framework::GradVarName("Out")); + auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler 
handler(softmax_tz, output->format(), - dout->format(), axis, dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); + SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(in_x_grad); auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); @@ -176,8 +169,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(dout->format()); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } }; } // namespace operators diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index cfc933c7a76fa..79d77235b7c81 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "acl/acl.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -30,6 +31,7 @@ using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; using NPUAttribute = framework::NPUAttribute; using NPUAttributeMap = framework::NPUAttributeMap; +using DeviceContextPool = platform::DeviceContextPool; class NpuOpRunner { public: @@ -90,41 +92,42 @@ aclrtStream GetCurrentNPUStream(int device_id = -1); template void FillNpuTensorWithConstant(Tensor *tensor, T val) { - // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small - // like 1e-8. 
- constexpr float MIN_PRECISION_FOR_POWER = 1e-3; PADDLE_ENFORCE_EQ( tensor->IsInitialized(), true, platform::errors::InvalidArgument("The tensor should be initialized.")); PADDLE_ENFORCE_EQ( platform::is_npu_place(tensor->place()), true, platform::errors::InvalidArgument("The tensor should be on NPUPlace.")); - // do async for better performance - if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) && - static_cast(val) > MIN_PRECISION_FOR_POWER) { - Tensor tmp(tensor->type()); - tmp.Resize(tensor->dims()); - tmp.mutable_data(tensor->place()); - auto stream = GetCurrentNPUStream( - BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device); - platform::NPUMemsetAsync(tmp.data(), 0, tmp.numel() * sizeof(T), - stream); - auto runner = NpuOpRunner("Power", {tmp}, {*tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(val)}}); - runner.Run(stream); - } else { - T *array = new T[tensor->numel()]; - for (unsigned int i = 0; i < tensor->numel(); ++i) { - array[i] = static_cast(val); - } - std::vector vec(tensor->numel(), static_cast(val)); - // do sync copy + + int numel = tensor->numel(); + if (numel == 1) { + Tensor npu_pinned_tensor(tensor->type()); + platform::NPUPinnedPlace npu_pinned_place; + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data({1}, npu_pinned_place); + *npu_pinned_ptr = val; + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), platform::CPUPlace(), array, - tensor->numel() * sizeof(T), nullptr); - delete[] array; + tensor->data(), npu_pinned_place, npu_pinned_ptr, + sizeof(T), GetCurrentNPUStream()); + + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation *allocation = + npu_pinned_tensor.Holder().get(); + + npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); + } else { + std::vector vec(numel, static_cast(val)); + auto device_id = platform::GetCurrentNPUDeviceId(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast( + pool.Get(platform::NPUPlace(device_id))); + + paddle::framework::TensorFromVector(vec, *dev_ctx, tensor); } } diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index a7886cdd670d4..7536654c5f5cc 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) " "Whether to use multi-precision during weight updating.") .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); AddComment(R"DOC( Adam Optimizer. @@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam) "EpsilonTensor", "If provided, Adam will use this as epsilon, " "this has a higher priority than attr(epsilon). " - "For better performance in npu kernel. ")); + "For better performance in npu kernel. ")) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 attribute [use_global_beta_pow]. 
+ )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_global_beta_pow", + "If true, Adam will use global beta_pow for whole model " + "instead of creating beta_pow for each parameter." + "In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be " + "used in adam op, " + "and beta_pow will be updated after all adam op in the model.", + false)); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 3d6f0f99a52df..2ee2a08bf3bc6 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); @@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Cpu update - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { AdamKernelMEM<<>>( beta1, beta2, epsilon, beta1_pow->data(), @@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } - } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, rows, row_numel, grad_merge.rows().size(), lazy_mode, ndim); - // Update with cpu - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow->data(), @@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel { ctx.device_context()), param->numel()); for_range(functor); - // update beta1 and beta2 - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - 
beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 9667db8055b90..bbd4179d84d89 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); @@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace())); functor(param->numel()); - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; - + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); // update beta1 and beta2 - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } if (lazy_mode) { VLOG(3) << "run cpu lazy mode"; size_t row_count = grad_merge.rows().size(); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 343a670438862..e5fe7f20a42e0 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -36,7 +36,6 @@ class AdamNPUKernel : public framework::OpKernel { "but the received is %s", ctx.InputNames("Param").front(), framework::ToTypeName(param_var->Type()))); - T epsilon = static_cast(ctx.Attr("epsilon")); auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE_EQ(grad_var->IsType(), true, @@ -50,8 +49,8 @@ class AdamNPUKernel : public framework::OpKernel { auto* mom2 = ctx.Input("Moment2"); auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); auto* param_out = ctx.Output("ParamOut"); auto* mom1_out = ctx.Output("Moment1Out"); @@ -59,25 +58,28 @@ class AdamNPUKernel : public framework::OpKernel { auto* beta1_pow_out = ctx.Output("Beta1PowOut"); auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << 
"use_global_beta_pow:" << use_global_beta_pow; + param_out->mutable_data(ctx.GetPlace()); mom1_out->mutable_data(ctx.GetPlace()); mom2_out->mutable_data(ctx.GetPlace()); - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place. + // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform + // place. + LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; if (beta1_pow->place() == platform::CPUPlace()) { T beta1 = *beta1_pow->data(); - // `mutable_data` operation needs to be done after getting data - beta1_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta1_pow_out, beta1); - } else { - beta1_pow_out->mutable_data(ctx.GetPlace()); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); + beta1_pow = &beta1_pow_tmp; } if (beta2_pow->place() == platform::CPUPlace()) { T beta2 = *beta2_pow->data(); - beta2_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta2_pow_out, beta2); - } else { - beta2_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); + beta2_pow = &beta2_pow_tmp; } const Tensor* beta1_tensor = nullptr; @@ -174,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel { *mom2, ctx.GetPlace(), ctx.template device_context(), mom2_out); } - auto runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); - runner_m1.Run(stream); - auto runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); - runner_m2.Run(stream); + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + auto runner_m1 = + NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); + runner_m1.Run(stream); + auto runner_m2 = + NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); + runner_m2.Run(stream); + } } }; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 09f117374499b..0f5706e428e15 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel { "value is:%d.", beta2_pow_out->numel())); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + T beta1 = static_cast(ctx.Attr("beta1")); if (ctx.HasInput("Beta1Tensor")) { auto* beta1_tensor = ctx.Input("Beta1Tensor"); @@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel { mom1_out.template mutable_data(ctx.GetPlace()), mom2_out.template mutable_data(ctx.GetPlace()), param_out.template mutable_data(ctx.GetPlace()), param.numel()); - - // update in cpu and then copy to xpu - if (beta1_pow.place() == platform::CPUPlace() && - beta2_pow.place() == platform::CPUPlace()) { - const T* beta1_pow_p = beta1_pow.template data(); - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow_p[0]; - const T* beta2_pow_p = beta2_pow.template data(); - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow_p[0]; - } else { - T cpu_beta1_pow_out_data; - T cpu_beta2_pow_out_data; - memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), - beta1_pow_ptr, sizeof(T)); - - cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; - 
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), - beta2_pow_ptr, sizeof(T)); - - cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; - - T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); - T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta1_pow_out_p, platform::CPUPlace(), - &cpu_beta1_pow_out_data, sizeof(T)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta2_pow_out_p, platform::CPUPlace(), - &cpu_beta2_pow_out_data, sizeof(T)); + if (!use_global_beta_pow) { + // update in cpu and then copy to xpu + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + const T* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow_p[0]; + const T* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow_p[0]; + + } else { + T cpu_beta1_pow_out_data; + T cpu_beta2_pow_out_data; + + memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), + beta1_pow_ptr, sizeof(T)); + + cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; + memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), + beta2_pow_ptr, sizeof(T)); + + cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; + + T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); + T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta1_pow_out_p, platform::CPUPlace(), + &cpu_beta1_pow_out_data, sizeof(T)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta2_pow_out_p, platform::CPUPlace(), + &cpu_beta2_pow_out_data, sizeof(T)); + } + + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); } - - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - r)); } else { PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( "Variable type not supported by adam_op")); diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 1d072936f409c..df2eb70b144e4 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -20,6 +20,8 @@ limitations under the License. 
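/* Editorial sketch (not part of the patch): what the Beta1Pow/Beta2Pow updates guarded
   by use_global_beta_pow above maintain. Beta1Pow/Beta2Pow hold the running powers of
   beta1/beta2 consumed by Adam's bias correction; they depend only on the step count,
   not on any particular parameter, which is why a single global copy per model (with
   the per-op update skipped) can replace the per-parameter copies, as the new attribute
   description states. A scalar Adam step in plain C++ for reference (standard Adam,
   illustrative only, not the kernels from this patch): */
#include <cmath>

void adam_step(float grad, float lr, float beta1, float beta2, float epsilon,
               float* param, float* moment1, float* moment2,
               float* beta1_pow, float* beta2_pow) {
  *moment1 = beta1 * *moment1 + (1.f - beta1) * grad;
  *moment2 = beta2 * *moment2 + (1.f - beta2) * grad * grad;
  *beta1_pow *= beta1;  // the per-op update skipped when use_global_beta_pow is true
  *beta2_pow *= beta2;  // (beta_pow is then advanced once after all adam ops run)
  const float m_hat = *moment1 / (1.f - *beta1_pow);
  const float v_hat = *moment2 / (1.f - *beta2_pow);
  *param -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
}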
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/service/heter_client.h" #include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/framework/op_registry.h" + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::distributed; diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index ee111a0ec7c09..0ebfb2f1bcd22 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -59,16 +59,6 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out, size_t offset_i = offsets[i]; if (i == rank - 1) { - PADDLE_ENFORCE(x_stride == 1, - "When i:%d == rank:%d - 1, x_stride of random_crop_op " - "expected to be 1, but got %ld. Please check input " - "value.", - i, rank, x_stride); - PADDLE_ENFORCE(out_stride == 1, - "When i:%d == rank:%d - 1, out_stride of random_crop_op " - "expected to be 1, but got %ld. Please check input " - "value.", - i, rank, out_stride); x += offset_i; for (size_t j = 0; j < out_dim_i; ++j) { *out++ = *x++; diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 5f667999ee613..1174e72a76b1b 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -95,11 +95,11 @@ REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, REGISTER_OPERATOR(real_grad, ops::RealGradOp); REGISTER_OP_CPU_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu index b3d0855111b72..9bfb2878a6261 100644 --- a/paddle/fluid/operators/real_op.cu +++ b/paddle/fluid/operators/real_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc new file mode 100644 index 0000000000000..9cc8ac200b8ee --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class XPULogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(context.GetPlace()); + + std::vector axis_shape; + std::vector xdims(input_dim_size); + for (int i = 0; i < input_dim_size; ++i) { + xdims[i] = input->dims()[i]; + } + if (reduce_all) { + for (int i = 0; i < input_dim_size; ++i) { + axis_shape.push_back(i); + } + } else { + for (size_t i = 0; i < axis.size(); ++i) { + int rdim = axis[i] < 0 ? axis[i] + input_dim_size : axis[i]; + axis_shape.push_back(rdim); + } + } + + auto& dev_ctx = context.template device_context(); + int r = xpu::logsumexp(dev_ctx.x_context(), input_data, output_data, + xdims, axis_shape); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU logsumexp kernel error! error value[%d %]", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + logsumexp, + ops::XPULogsumexpKernel); +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h new file mode 100644 index 0000000000000..f4ea18edb2a95 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct CustomMin { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b < a) ? b : a; + } +}; + +template +struct CustomMax { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b > a) ? 
b : a; + } +}; + +template +struct CustomSum { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } +}; + +template +struct CustomMul { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index fdb2c57385b2b..c8d568c8c2cf7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -100,6 +100,8 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel; -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel, CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index cc3653fcb43a4..50d2fcdee23bd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -65,5 +65,6 @@ class ReduceMeanKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, ops::ReduceMeanKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 289f574719ff0..0e133d5447f93 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -20,5 +20,6 @@ using CUDAReduceMeanGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh new file mode 100644 index 0000000000000..91d7fb7c8439a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -0,0 +1,646 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/framework/array.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { +namespace detail { + +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + DEVICE explicit inline IdentityFunctor() {} + + DEVICE inline T operator()(const T& x) const { return x; } +}; + +// Post processing function for mean +template +struct DivideFunctor { + DEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + DEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +static inline std::vector GetStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +#ifdef __HIPCC__ +constexpr int kMaxBlockDim = 256; +#else +constexpr int kMaxBlockDim = 512; +#endif + +static inline int GetDesiredBlockDim(int block_dim) { + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(block_dim))); +} + +static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { + if (rank % 2 == 0) { + PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, " + "reduce_rank must be %d, but got %d.", + rank, rank / 2, reduce_rank)); + } else { + auto lower_rank = (rank - 1) / 2; + auto upper_rank = (rank + 1) / 2; + PADDLE_ENFORCE_EQ( + reduce_rank == lower_rank || reduce_rank == upper_rank, true, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " + "must be %d or %d, but got %d.", + rank, lower_rank, upper_rank, reduce_rank)); + } +} + +template +static inline paddle::framework::Array from( + const VectorLikeType& vec) { + PADDLE_ENFORCE_EQ(vec.size(), ElementCount, + platform::errors::InvalidArgument( + "Cub reduce Array: size not match. 
Received " + "vec.size() %d != ElementCount %d.", + vec.size(), ElementCount)); + size_t n = static_cast(vec.size()); + paddle::framework::Array ret; + for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + return ret; +} + +} // namespace detail + +enum ReduceType { + kReduceAll = 0x00, + kReduceLastDim = 0x01, + kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim + kReduceAny = 0x03, +}; + +// reduce config +template +struct ReduceConfig { + ReduceConfig(std::vector origin_reduce_dims, std::vector x_dim) + : reduce_dims_origin(origin_reduce_dims), x_dim(x_dim) {} + + // get the parameters of reduceKernel + void Run() { + // step1: update the reduce_dim left_dim and x_dim + SetReduceDim(); + // step2: get the strides of dim for reduceAny and reduceLastDim + SetStrides(); + // step3: get the type of reduce + SetReduceType(); + // step4: set the block and grid for launch kernel + SetBlockDim(); + } + + // when should_reduce_again is true, we need malloc temp space for temp data + void SetOutputData(Ty* y_data, const platform::Place& place, + framework::Tensor& tmp) { + if (should_reduce_again) { + output_data = tmp.mutable_data( + framework::make_ddim( + {static_cast(left_num * grid.y * sizeof(Ty))}), + place); + } else { + output_data = y_data; + } + } + + private: + // set reduce_dim, left_dim and update x_dim + // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] + // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] + void SetReduceDim() { + std::set reduce_set; + + for (auto e : reduce_dims_origin) { + auto pos = e >= 0 ? e : e + x_dim.size(); + reduce_set.insert(pos); + } + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); + // get reduce_dim + if (reduce_dim_temp.size() > 1) { + int num = 0; // for update axis + reduce_dim.push_back(reduce_dim_temp[0]); + for (int idx = 1; idx < reduce_dim_temp.size(); idx++) { + // update x_dim + if (reduce_dim_temp[idx] - reduce_dim_temp[idx - 1] == 1) { + x_dim[reduce_dim_temp[idx - 1]] *= x_dim[reduce_dim_temp[idx]]; + x_dim.erase(x_dim.begin() + reduce_dim_temp[idx]); + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx] - num); + } + } + } else { + reduce_dim = reduce_dim_temp; + } + + // update new_x_dim and new_reduce_dim + std::vector new_x_dim, new_reduce_dim_temp; + int is_reduced = 0; + for (auto e : reduce_dim) { + is_reduced |= 1 << e; + } + + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + new_x_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + new_reduce_dim_temp.push_back(new_x_dim.size() - 1); + } else { + new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + } + } + + x_dim = new_x_dim; + reduce_dim = new_reduce_dim_temp; + + int x_rank = static_cast(x_dim.size()); + std::set left_set; + + for (int i = 0; i < x_rank; ++i) { + left_set.insert(i); + } + + for (auto e : reduce_dim) { + left_set.erase(e); + } + + left_dim.assign(left_set.begin(), left_set.end()); + } + + // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny + // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] + // --SetStrides--> x_strides= [6,1], reduce_strides = [1], + // left_strides = [1] + void SetStrides() { + std::vector idx_dim; + for (int i = 0; i < x_dim.size(); i++) { + idx_dim.push_back(i); + } + + x_strides = detail::GetStrides(x_dim, idx_dim); + reduce_strides = detail::GetStrides(x_dim, reduce_dim); + left_strides = detail::GetStrides(x_dim, left_dim); 
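/* Editorial sketch (not part of the patch): the dimension/stride bookkeeping above,
   run by hand on the example from the comments (x_dim = [2, 4, 6],
   origin_reduce_dims = [0, 1]). SetReduceDim merges the adjacent reduced axes, giving
   x_dim = [8, 6], reduce_dim = [0], left_dim = [1]; GetStrides then yields
   x_strides = [6, 1], reduce_strides = [1], left_strides = [1]. Plain host-side C++: */
#include <cstdio>
#include <vector>

// Same recurrence as detail::GetStrides above: row-major strides of the
// sub-shape selected by `idx`, innermost stride 1.
static std::vector<int> Strides(const std::vector<int>& dims,
                                const std::vector<int>& idx) {
  int n = static_cast<int>(idx.size());
  if (n == 0) return {};
  std::vector<int> s(n);
  s.back() = 1;
  for (int i = n - 2; i >= 0; --i) s[i] = s[i + 1] * dims[idx[i + 1]];
  return s;
}

int main() {
  std::vector<int> x_dim = {8, 6};  // [2, 4, 6] with the two reduced axes merged
  std::vector<int> all = {0, 1}, reduce = {0}, left = {1};
  std::printf("x_strides=[%d,%d] reduce_strides=[%d] left_strides=[%d]\n",
              Strides(x_dim, all)[0], Strides(x_dim, all)[1],
              Strides(x_dim, reduce)[0], Strides(x_dim, left)[0]);
  return 0;
}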
+ reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + + left_num = 1; + if (left_dim.size()) { + left_num = left_strides[0] * x_dim[left_dim[0]]; + } + } + + // get the reduceType + // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim + // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim + // x_dim = [8] reduce_dim = [0] --> reduceAll + // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny + void SetReduceType() { + int rank = x_dim.size(); + int reduce_rank = reduce_dim.size(); + + if (rank == reduce_rank) { + reduce_type = static_cast(ReduceType::kReduceAll); + + } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + reduce_type = static_cast(ReduceType::kReduceLastDim); + } else if (reduce_rank == 1) { + // ReduceFirstDim and reduceSecondDim + reduce_type = static_cast(ReduceType::kReduceHigherDim); + + } else { + reduce_type = static_cast(ReduceType::kReduceAny); + } + } + + // set block and grid for launch kernel + // for ReduceHigherDim: if block is enough -> splite reduce_num + // else init block(32, 1) grid(block_num, 1) + // for others: block(block_num, 1) , grid(left_num, 1) + void SetBlockDim() { + // init + int block_num = detail::GetDesiredBlockDim(reduce_num); + should_reduce_again = false; + + dim3 block_dim(block_num, 1); + dim3 grid_dim(left_num, 1); + blocking_size = reduce_num; + + if (reduce_type == ReduceType::kReduceHigherDim) { + int last_dim_num = x_dim.back(); + // update left_num + int grid_z = left_num / last_dim_num; + left_num = last_dim_num; + + block_dim.z = 1; + grid_dim.z = grid_z; + + int device_id = platform::GetCurrentDeviceId(); + int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_threads_per_mp = + platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + + // init + int num_block = (max_threads / left_num); + + if (num_block > 1 && reduce_num >= 512) { + blocking_size = detail::GetLastPow2(reduce_num / num_block); + + if (blocking_size <= 1) { + blocking_size = detail::GetLastPow2(sqrt(reduce_num)); + } else if (blocking_size * 2 < reduce_num) { + blocking_size *= 2; + } + + should_reduce_again = true; + + block_dim.x = 32; + block_dim.y = 1; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size; + + } else { + block_dim.x = 32; + block_dim.y = 1; + blocking_size = reduce_num; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = 1; + } + } + + block = block_dim; + grid = grid_dim; + } + + public: + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; + + int reduce_type; + int reduce_num; + int left_num; + int blocking_size; + bool should_reduce_again; + + Ty* output_data; + + dim3 block; + dim3 grid; +}; + +template +__device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, + ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + __syncthreads(); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = transformer(reduce_var); + } +} + 
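/* Editorial sketch (not part of the patch): what one launch of ReduceLastDim above
   computes, written as sequential host C++. Each CUDA block owns one output element
   (one row of an [left_num, reduce_num] view); its threads stride over the contiguous
   reduce dimension and the partials are combined with cub::BlockReduce, with the
   transformer (e.g. DivideFunctor for mean) applied once at the end. */
#include <cstdio>

template <typename T, typename ReduceOp, typename TransformOp>
void ReduceLastDimReference(const T* x, T* y, ReduceOp reducer,
                            TransformOp transformer, T init,
                            int left_num, int reduce_num) {
  for (int i = 0; i < left_num; ++i) {      // one CUDA block per output i
    T acc = init;
    for (int j = 0; j < reduce_num; ++j)    // threads cooperate over j in the kernel
      acc = reducer(acc, x[i * reduce_num + j]);
    y[i] = transformer(acc);                // post-processing, e.g. divide by reduce_num
  }
}

int main() {
  float x[6] = {1, 2, 3, 4, 5, 6};          // shape [2, 3], reduce the last dim
  float y[2];
  ReduceLastDimReference(x, y, [](float a, float b) { return a + b; },
                         [](float s) { return s / 3.0f; }, 0.0f, 2, 3);
  std::printf("%g %g\n", y[0], y[1]);       // 2 and 5, the row means
  return 0;
}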
+template +__device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, + ReduceOp reducer, + TransformOp transformer, + Ty init, int reduce_num, + int left_num, int block_size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * block_size; + + Ty temp = init; + Ty reduce_var = init; + + if (idx < left_num) { + int loop = reduce_num - idy; + loop = loop > block_size ? block_size : loop; + for (int iy = 0; iy < loop; iy++) { + int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; + reduce_var = reducer(reduce_var, static_cast(x[id])); + } + y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = + static_cast(transformer(reduce_var)); + } +} + +template +__device__ __forceinline__ void ReduceAny( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + + int sub_index[Rank]; + int left_idx = blockIdx.x; + for (int i = 0; i < Rank - ReduceRank; ++i) { + sub_index[left_dim[i]] = left_idx / left_strides[i]; + left_idx %= left_strides[i]; + } + + int reduce_idx = threadIdx.x; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + Ty reduce_var = static_cast(x[idx_x]); + + for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { + int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + reduce_var = + static_cast(reducer(reduce_var, static_cast(x[idx_x]))); + } + __syncthreads(); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = transformer(reduce_var); + } +} + +template +__device__ __forceinline__ void ReduceModule( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, int left_num, int blocking_size, + paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + if (ReduceType == ReduceType::kReduceLastDim) { + ReduceLastDim( + x, y, reducer, transformer, init, reduce_num); + + } else if (ReduceType == ReduceType::kReduceHigherDim) { + ReduceHigherDim( + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + + } else { + ReduceAny( + x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); + } +} + +template +__global__ void ReduceKernelFunction( + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + int reduce_num, int left_num, int block_size, + paddle::framework::Array x_strides, + paddle::framework::Array reduce_dim, + paddle::framework::Array reduce_strides, + paddle::framework::Array left_dim, + paddle::framework::Array left_strides) { + ReduceModule(x, y, reducer, transformer, init, reduce_num, + left_num, block_size, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); +} + +template 
+static void launchKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, const ReduceOp& reducer, + const TransformOp& transformer, const Ty& init, + gpuStream_t stream, ReduceConfig config) { +#define CUB_REDUCE_TYPE_CASE(type) \ + case type: { \ + constexpr auto kReduceType = type; \ + ReduceKernelFunction< \ + Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ + kReduceType><<>>( \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, config.left_num, config.blocking_size, \ + detail::from(config.x_strides), \ + detail::from(config.reduce_dim), \ + detail::from(config.reduce_strides), \ + detail::from(config.left_dim), \ + detail::from(config.left_strides)); \ + } break + + switch (config.reduce_type) { + CUB_REDUCE_TYPE_CASE(1); // reduceLastDim + CUB_REDUCE_TYPE_CASE(2); // ReduceHigherDim + CUB_REDUCE_TYPE_CASE(3); // reduceAny + } + + if (config.should_reduce_again) { + dim3 block(config.block.x, 1, 1); + dim3 grid(config.grid.x, 1, config.grid.z); + + ReduceKernelFunction< + Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, + ReduceType::kReduceHigherDim><<>>( + config.output_data, y_data, reducer, detail::IdentityFunctor(), + init, config.grid.y, config.left_num, config.grid.y, + detail::from(config.x_strides), + detail::from(config.reduce_dim), + detail::from(config.reduce_strides), + detail::from(config.left_dim), + detail::from(config.left_strides)); + } +} + +template +static void launchReduceKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, + const ReduceOp& reducer, + const TransformOp& transformer, const Ty& init, + gpuStream_t stream, ReduceConfig config) { + int reduce_rank = config.reduce_strides.size(); + int rank = config.x_strides.size(); + +#define CUB_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kRank = i; \ + switch (reduce_rank) { __VA_ARGS__; } \ + } break + +#define CUB_REDUCE_RANK_CASE(i, ...) 
\ + case i: { \ + constexpr auto kReduceRank = i; \ + launchKernel( \ + x_data, y_data, place, reducer, transformer, init, stream, config); \ + } break + + // launch CUB::Reduce + if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + cub::TransformInputIterator trans_x( + x_data, transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + + return; + } + + detail::CheckReduceRankIsValid(reduce_rank, rank); + switch (rank) { + CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); + + CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(2); CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(4); CUB_REDUCE_RANK_CASE(5);); + } + +#undef CUB_REDUCE_RANK_CASE +#undef CUB_RANK_CASE +} +template +void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, const Ty& init, + const ReduceOp& reducer, const TransformOp& transformer, + gpuStream_t stream) { + auto x_dim = framework::vectorize(x.dims()); + auto config = ReduceConfig(origin_reduce_dims, x_dim); + config.Run(); + + auto x_data = x.data(); + auto y_data = y->mutable_data(x.place()); + + framework::Tensor tmp; + // SetOutputData for ReduceHigherDim when should_reduce_again is true, + // temp_output should be stored temp_data in output_data space or stored in + // y_data; + config.SetOutputData(y_data, x.place(), tmp); + + if (config.reduce_num == 1) { + auto out_dims = y->dims(); + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + return; + } + +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + launchReduceKernel( \ + x_data, y_data, x.place(), reducer, transformer, init, stream, \ + config); \ + } break + + switch (detail::GetDesiredBlockDim(config.reduce_num)) { + CUB_BLOCK_DIM_CASE(512); + CUB_BLOCK_DIM_CASE(256); + CUB_BLOCK_DIM_CASE(128); + CUB_BLOCK_DIM_CASE(64); + CUB_BLOCK_DIM_CASE(32); + CUB_BLOCK_DIM_CASE(16); + CUB_BLOCK_DIM_CASE(8); + CUB_BLOCK_DIM_CASE(4); + CUB_BLOCK_DIM_CASE(2); + } +#undef CUB_BLOCK_DIM_CASE +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 5a8e8894e1c5d..a085e851eea77 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -109,8 +109,10 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( - reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, @@ -128,7 +130,8 @@ using CPUReduceSumGradKernel = ops::ReduceSumGradKernel; -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, CPUReduceSumGradKernel, CPUReduceSumGradKernel, CPUReduceSumGradKernel, 
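/* Editorial sketch (not part of the patch): the kReduceAll branch above uses the usual
   two-call cub::DeviceReduce pattern, which the snippet below shows in isolation. The
   first call passes a null scratch pointer and only reports how many bytes of temporary
   storage the reduction needs; the second call performs the reduction. The patch stores
   the scratch in a framework::Tensor; cudaMalloc is used here only to keep the sketch
   self-contained. */
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int n = 1024;
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemset(d_in, 0, n * sizeof(float));   // all zeros, so the expected sum is 0
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // 1st call: d_temp == nullptr, only temp_bytes is written.
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, cub::Sum(), 0.0f);
  cudaMalloc(&d_temp, temp_bytes);
  // 2nd call: the actual reduction.
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, cub::Sum(), 0.0f);
  float h_out = -1.0f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %g\n", h_out);         // prints 0
  cudaFree(d_temp); cudaFree(d_out); cudaFree(d_in);
  return 0;
}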
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index 219cc231a1ea7..dbd020514b208 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -70,7 +70,8 @@ class ReduceSumKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, +REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index f2bee6dddc39e..67de8bb9a0c1a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -20,7 +20,8 @@ using CUDAReduceSumGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, +REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index a9b1f299dab82..a71f49585bfca 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -54,6 +54,21 @@ class ScaleOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,6 +102,9 @@ if bias_after_scale=True: "Apply bias addition after or before scaling. It is useful for " "numeric stability in some circumstances.") .SetDefault(true); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); } }; @@ -112,6 +130,8 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { grad_op->SetAttr("scale", this->GetAttr("scale")); grad_op->SetAttr("bias", 0.0f); grad_op->SetAttr("bias_after_scale", true); + if (grad_op->HasAttr("use_mkldnn")) + grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 4aec4c1742279..8fe456edeabf1 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -15,44 +15,481 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/softmax_impl.cuh" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/for_range.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#else +#include "paddle/fluid/platform/cudnn_helper.h" +#endif namespace paddle { namespace operators { +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; -namespace { +// Wrapper of log function. Use log(float32) for float16 template -__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int64_t n, const int64_t d, - const int64_t remain, const int ignore_index) { - CUDA_KERNEL_LOOP_TYPE(index, n * remain, int64_t) { - int64_t idx_n = index / remain; - int64_t idx_remain = index % remain; - int64_t tmp = labels[index]; - if (ignore_index != tmp) { - int64_t idx = idx_n * d + tmp * remain + idx_remain; - logit_grad[idx] -= static_cast(1.); +static __device__ __forceinline__ T Log(T x) { + using AccT = typename details::MPTypeTrait::Type; + AccT logx = std::log(static_cast(x)); + return math::TolerableValue()(static_cast(logx)); +} + +// Wrapper of exp function. Use exp(float32) for float16 +template +static __device__ __forceinline__ T Exp(T x) { + using AccT = typename details::MPTypeTrait::Type; + AccT expx = std::exp(static_cast(x)); + return math::TolerableValue()(static_cast(expx)); +} + +// log2(value) +static inline int Log2Ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; + +/* + Hard label cross entropy. +*/ +template +__global__ void CrossEntropyHardLabel(T* loss, const T* softmax, + const int64_t* labels, const int n, + const int dim, const int d, + const int ignore_idx) { + int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = ids / d; + int64_t idx_d = ids % d; + + // thread ids compute loss[ids] using softmax[idx] + if (ids < n * d) { + int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (labels[ids] == ignore_idx) { + loss[ids] = static_cast(0.0); + } else { + loss[ids] = -Log(softmax[idx]); + } + } else { + // IgnoreIndex is false + loss[ids] = -Log(softmax[idx]); } } } +/* + Hard label cross entropy with exp. 
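/* Editorial sketch (not part of the patch): a sequential reference of what
   CrossEntropyHardLabel above computes for the d == 1 case (no trailing spatial dims):
   loss[i] = -log(softmax[i][label[i]]), with rows whose label equals ignore_index
   producing 0 when the IgnoreIndex template flag is enabled. Plain host-side C++ with
   illustrative values only. */
#include <cmath>
#include <cstdio>

void cross_entropy_hard_label(float* loss, const float* softmax,
                              const long* labels, int n, int dim,
                              int ignore_idx) {
  for (int i = 0; i < n; ++i)
    loss[i] = (labels[i] == ignore_idx)
                  ? 0.0f
                  : -std::log(softmax[i * dim + labels[i]]);
}

int main() {
  const float softmax[2 * 3] = {0.7f, 0.2f, 0.1f, 0.25f, 0.25f, 0.5f};
  const long labels[2] = {0, 2};
  float loss[2];
  cross_entropy_hard_label(loss, softmax, labels, 2, 3, /*ignore_idx=*/-1);
  std::printf("%f %f\n", loss[0], loss[1]);  // -log(0.7) and -log(0.5)
  return 0;
}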
+ Input: log softmax + Output: loss and exp(input) +*/ +template +__global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, + const int64_t* labels, const int n, + const int dim, const int d, + const int ignore_idx) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + if (idx < n * dim * d) { + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (idx_dim == labels[ids]) { + if (labels[ids] == ignore_idx) { + loss[ids] = static_cast(0.0); + } else { + loss[ids] = -softmax[idx]; + } + } + } else { + // IgnoreIndex is false + if (labels[ids] >= 0 && labels[ids] < dim) { + if (labels[ids] == idx_dim) { + loss[ids] = -softmax[idx]; + } + } else { + loss[ids] = static_cast(0.0); + } + } + softmax[idx] = Exp(softmax[idx]); + } +} + +/* + Core function of softmax with cross entropy forward + - softmax, SoftmaxMode=kSoftmax + - log softmax, SoftmaxMode=kLogSoftmax + - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy + The computation includes + - Compute max value: maxvalue_{i} = max_j src_{i,j} + - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} + - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) + - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) + This computation results from following formula: + softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} + = e^{src_{i,j} - maxvalue_{i}} + / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + = e^{src_{i,j} - maxvalue_{i}} / s_{i} + logsoftmax_{i,j} = log(softmax_{i,j}) + = src_{i,j} - maxvalue_{i} - log(s_{i}) + One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). + For reduction max (sum), firstly compute max (sum) to one warp, then use + shuffle api to compute max (sum) in one warp. +*/ +template +__global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, + const int64_t* label, const int batch_size, + const int stride, const int element_count, + const int ignore_index) { + constexpr int kDimCeil = 1 << Log2Elements; + constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + constexpr int kVSize = sizeof(VecT) / sizeof(T); + constexpr int kIterations = kDimCeil / kWarpSize; + constexpr int kIterationsV = + (kIterations >= kVSize) ? (kIterations / kVSize) : 1; + constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; + + // max index to read + int idx_max_v[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; i++) { + int idx_max = ((i + first_batch) < batch_size) ? 
element_count : 0; + idx_max_v[i] = idx_max / kVSize; + } + + // read data from global memory + AccT srcdata[kBatchSize][kIterationsV][kVSize]; + +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { +// read data to srcdata: - KVSize==1, - KVSize>1 +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int src_idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { + if (src_idx < idx_max_v[i]) { + srcdata[i][it][0] = + static_cast(src[(first_batch + i) * stride + src_idx]); + } else { + srcdata[i][it][0] = -std::numeric_limits::infinity(); + } + } else { + const VecT* src_v = + reinterpret_cast(&src[(first_batch + i) * stride]); + if (src_idx < idx_max_v[i]) { + VecT srctmp = src_v[src_idx]; + const T* srcinptr = reinterpret_cast(&srctmp); +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = static_cast(srcinptr[s]); + } + } else { +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = -std::numeric_limits::infinity(); + } + } + } + } + } + + // compute max value: maxvalue_{i} = max_j src_{i,j} + AccT max_value[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + AccT valmax = srcdata[i][0][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; + } + max_value[i] = valmax; + +// it = 1, 2, ... +#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { + AccT valmax = srcdata[i][it][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; + } + max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; + } + } + WarpReduceMax(max_value); + + // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } + AccT sum[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); + } else { + srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); + sum[i] = srcdata[i][0][0]; + } +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); + } else { + srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); + sum[i] += srcdata[i][0][s]; + } + } + +// it = 1, 2, ... 
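    // Accumulate the remaining vector chunks: sum[i] += exp(src - max_value[i]).  In the
    // kLogSoftmax / kCrossEntropy modes only the running sum is needed; in plain kSoftmax
    // mode the shifted exponentials are also written back into srcdata for reuse when the
    // normalized values are emitted below.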
+#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); + } else { + srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); + sum[i] += srcdata[i][it][s]; + } + } + } + } + WarpReduceSum(sum); + +// write data +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::log(sum[i]); + } + +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { // kVSize==1 + if (idx < idx_max_v[i]) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax + softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (label[first_batch + i] == loss_idx) { + if (label[first_batch + i] != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { + // IgnoreIndex is false + if (label[first_batch + i] >= 0 && + label[first_batch + i] < element_count) { + if (label[first_batch + i] == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] / sum[i]; + } + } else { + break; + } + } else { // KVSize>1 + VecT* softmax_v = + reinterpret_cast(&softmax[(first_batch + i) * stride]); + VecT tmpdata; + T* tmpptr = reinterpret_cast(&tmpdata); +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax + tmpptr[s] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (label[first_batch + i] == loss_idx && + label[first_batch + i] != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } + } else { + // IgnoreIndex is false + if (label[first_batch + i] >= 0 && + label[first_batch + i] < element_count) { + if (label[first_batch + i] == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + tmpptr[s] = srcdata[i][it][s] / sum[i]; + } + } + if (idx < idx_max_v[i]) { + softmax_v[idx] = tmpdata; + } else { + break; + } + } + } + } +} + +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward<<>>( \ + loss, softmax, src, label, batch_size, stride, element_count, \ + ignore_index); \ + break; + +/* + Wrapper of softmax with cross entropy forward hard label. 
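  Dispatches on Log2Ceil(element_count) to a WarpSoftmaxForward specialization and derives the
  launch shape: warps of at most 32 threads, 128 threads per block, and two rows per warp when
  the padded element count is <= 128.  For example, element_count = 100 gives Log2Elements = 7,
  kDimCeil = 128, 32-thread warps, 4 warps per block, and therefore 8 rows per block.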
+*/ +template +void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, + const int64_t* label, const int batch_size, + const int stride, const int element_count, + const int ignore_index, gpuStream_t stream) { + using AccT = typename details::MPTypeTrait::Type; + + // use 128 threads per block to maximimize gpu utilization + const int Log2Elements = static_cast(Log2Ceil(element_count)); + const int kDimCeil = 1 << Log2Elements; + int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / kWarpSize); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_size + batches_per_block - 1) / batches_per_block; + dim3 threads(kWarpSize, warps_per_block, 1); + + switch (Log2Elements) { + SOFTMAX_WARP_FORWARD_CASE(0, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(1, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(2, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(3, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(4, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(5, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(6, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(7, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(8, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(9, T, AccT); + default: + break; + } +} + +/* + Wrapper of softmax with cross entropy hard label. + - SwitchWarpSoftmaxForward for small size + - cudnn function for large size +*/ +template +static void SoftmaxWithCrossEntropyHardLabel( + const platform::CUDADeviceContext& ctx, int rank, int axis, + const T* logits_data, const int64_t* labels_data, T* loss_data, + T* softmax_data, int N, int dim, int D, const int ignore_index) { + auto stream = ctx.stream(); + constexpr int max_dim = 320; + if (D == 1 && dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, dim, + ignore_index, stream); + } else { + ScopedTensorDescriptor desc; + std::vector tensor_dims = {N, dim, D, 1}; + DataLayout layout = DataLayout::kNCHW; +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#else + cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#endif + + auto handle = ctx.cudnn_handle(); + +#ifdef PADDLE_WITH_HIP + auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE + : MIOPEN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + handle, platform::CudnnDataType::kOne(), descp, logits_data, + platform::CudnnDataType::kZero(), descp, softmax_data, + MIOPEN_SOFTMAX_LOG, mode)); +#else + auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE + : CUDNN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), + descp, logits_data, platform::CudnnDataType::kZero(), descp, + softmax_data)); +#endif + int threads = 128; + int blocks = (N * dim * D + threads - 1) / threads; + // compute cross entropy, input is log softmax + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); + } +} + +/* + Wrapper of softmax with cross entropy grad hard label. 
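  For softmax output s_{i,j} and hard label y_i, the gradient w.r.t. the logits is
      dL/dx_{i,j} = (s_{i,j} - 1[j == y_i]) * dloss_i,
  and it is set to zero for rows whose label equals ignore_index.  The kernel below applies
  this in place to logits_grad, which on entry is expected to hold the softmax values.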
+*/ template -__global__ void Scale(T* logit_grad, const T* loss_grad, const int64_t num, - const int64_t d, const int64_t remain, - const int64_t* labels, const int ignore_index) { - CUDA_KERNEL_LOOP_TYPE(index, num, int64_t) { - int64_t idx_n = index / d; - int64_t idx_remain = index % remain; - int64_t idx_lbl = idx_n * remain + idx_remain; - if (labels[idx_lbl] == ignore_index) { - logit_grad[index] = static_cast(0.); +__global__ void SoftmaxWithCrossEntropyGradHardLabel( + T* logits_grad, const T* loss_grad, const int64_t* labels, const int64_t n, + const int64_t dim, const int64_t d, const int ignore_index) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + if (idx < n * dim * d) { + if (labels[ids] == ignore_index) { + logits_grad[idx] = static_cast(0.0); + } else if (labels[ids] == idx_dim) { + logits_grad[idx] = + (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logit_grad[index] *= loss_grad[idx_lbl]; + logits_grad[idx] *= loss_grad[ids]; } } } @@ -123,8 +560,6 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, const T* loss_grad, } } -} // namespace - static __device__ __forceinline__ platform::float16 exp_on_device( platform::float16 x) { return ::Eigen::numext::exp(x); @@ -396,278 +831,6 @@ static __global__ void RowReductionForCrossEntropy(const T* logits_data, if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; } -template -struct HardLabelCrossEntropyFunctor { - public: - HardLabelCrossEntropyFunctor(const int64_t* labels, T* loss, - const T* logits_data, int d, int axis_dim) - : labels_(labels), - loss_(loss), - logits_data_(logits_data), - d_(d), - axis_dim_(axis_dim) {} - - __device__ void operator()(int idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int remain = d_ / axis_dim_; - int idx_n = idx / d_; - int idx_axis = (idx % d_) / remain; - int idx_remain = idx % remain; - // labels, loss view as [n, remain] - int idx_lbl = idx_n * remain + idx_remain; - // It also would ignore labels not in range(class_num). 
- if (idx_axis != labels_[idx_lbl]) { - } else { - loss_[idx_lbl] = -log_on_device(logits_data_[idx]); - } - } - - private: - const int64_t* labels_; - T* loss_; - const T* logits_data_; - int d_; - int axis_dim_; -}; - -template -struct HardLabelCrossEntropyFunctorWithIgnoreIdx { - public: - HardLabelCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, T* loss, - const T* logits_data, int d, - int axis_dim, int ignore_idx) - : labels_(labels), - loss_(loss), - logits_data_(logits_data), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int remain = d_ / axis_dim_; - int idx_n = idx / d_; - int idx_axis = (idx % d_) / remain; - int idx_remain = idx % remain; - // labels, loss view as [n, remain] - int idx_lbl = idx_n * remain + idx_remain; - - if (idx_axis == labels_[idx_lbl] && idx_axis != ignore_idx_) { - loss_[idx_lbl] = -log_on_device(logits_data_[idx]); - } - } - - private: - const int64_t* labels_; - T* loss_; - const T* logits_data_; - int d_; - int axis_dim_; - int ignore_idx_; -}; - -template -static void HardLabelCrossEntropy(const platform::CUDADeviceContext& ctx, - const T* logits_data, - const int64_t* labels_data, T* loss_data, - int n, int d, int axis_dim, int ignore_idx) { - constexpr int kMaxBlockDim = 512; - int block_dim = axis_dim >= kMaxBlockDim - ? kMaxBlockDim - : (1 << static_cast(std::log2(axis_dim))); - int grid_dim = n * d / axis_dim; - auto stream = ctx.stream(); - -#define CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, logits_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelCrossEntropyFunctor(labels_data, loss_data, \ - logits_data, d, axis_dim)); \ - } \ - } break - - switch (block_dim) { - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(512); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(256); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(128); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(64); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(32); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(16); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(8); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(4); - CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(2); - default: - PADDLE_THROW(platform::errors::Unavailable( - "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); - break; - } -#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL -} - -template -struct HardLabelSoftmaxWithCrossEntropyFunctor { - public: - HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss, - T* log_softmax, int64_t d, - int axis_dim, int ignore_idx) - : labels_(labels), - loss_(loss), - log_softmax_(log_softmax), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int64_t idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int64_t remain = d_ / axis_dim_; - int64_t idx_n = idx / d_; - int64_t idx_axis = (idx % d_) / remain; - int64_t idx_remain = idx % remain; - // labels, loss view as [n, remain] - int64_t idx_lbl = idx_n * remain + idx_remain; - PADDLE_ENFORCE(labels_[idx_lbl] >= 0 && labels_[idx_lbl] < d_ || - labels_[idx_lbl] == ignore_idx_, - "The value of label[%ld] expected >= 0 and < %ld, or == %d," - "but got %ld. 
Please check input value.", - idx_lbl, d_, ignore_idx_, labels_[idx_lbl]); - // It also would ignore labels not in range(class_num). - if (idx_axis != labels_[idx_lbl]) { - log_softmax_[idx] = exp_on_device(log_softmax_[idx]); - } else { - auto softmax = log_softmax_[idx]; - log_softmax_[idx] = exp_on_device(softmax); - loss_[idx_lbl] = -softmax; - } - } - - private: - const int64_t* labels_; - T* loss_; - T* log_softmax_; - int64_t d_; - int axis_dim_; - int ignore_idx_; -}; - -template -struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { - public: - HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, - T* loss, T* log_softmax, - int64_t d, int axis_dim, - int ignore_idx) - : labels_(labels), - loss_(loss), - log_softmax_(log_softmax), - d_(d), - axis_dim_(axis_dim), - ignore_idx_(ignore_idx) {} - - __device__ void operator()(int64_t idx) const { - // logits view as [n, axis_dim, remain], where d = axis_dim * remain - int64_t remain = d_ / axis_dim_; - int64_t idx_n = idx / d_; - int64_t idx_axis = (idx % d_) / remain; - int64_t idx_remain = idx % remain; - // labels, loss view as [n, remain] - int64_t idx_lbl = idx_n * remain + idx_remain; - if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) { - log_softmax_[idx] = exp_on_device(log_softmax_[idx]); - } else { - auto softmax = log_softmax_[idx]; - log_softmax_[idx] = exp_on_device(softmax); - loss_[idx_lbl] = -softmax; - } - } - - private: - const int64_t* labels_; - T* loss_; - T* log_softmax_; - int64_t d_; - int axis_dim_; - int ignore_idx_; -}; - -template -static void HardLabelSoftmaxWithCrossEntropy( - const platform::CUDADeviceContext& ctx, const T* logits_data, - const int64_t* labels_data, T* loss_data, T* softmax_data, int64_t n, - int64_t d, int axis_dim, int ignore_idx) { -#ifdef __HIPCC__ - // HIP platform will have loss nan if dim size > 256 - constexpr int kMaxBlockDim = 256; -#else - constexpr int kMaxBlockDim = 512; -#endif - int64_t block_dim = axis_dim >= kMaxBlockDim - ? 
kMaxBlockDim - : (1 << static_cast(std::log2(axis_dim))); - int64_t grid_dim = n * d / axis_dim; - auto stream = ctx.stream(); - -#ifdef __HIPCC__ -#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, d, axis_dim); \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, softmax_data, d, axis_dim); \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForDiff), \ - dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ - loss_data, softmax_data, d, axis_dim); \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } \ - } break -#else -#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ - case BlockDim: { \ - RowReductionForMax<<>>( \ - logits_data, loss_data, d, axis_dim); \ - RowReductionForDiffMaxSum<<>>( \ - logits_data, loss_data, softmax_data, d, axis_dim); \ - platform::ForRange for_range(ctx, n* d); \ - if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } else { \ - for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ - labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ - } \ - } break -#endif - - switch (block_dim) { - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); - CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); - default: - PADDLE_THROW(platform::errors::Unavailable( - "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); - break; - } -#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL -} - template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, @@ -783,7 +946,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { const int rank = softmax->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = softmax->dims()[axis]; + const int axis_dim = softmax->dims()[axis]; const int n = SizeToAxis(axis, softmax->dims()); const int d = SizeFromAxis(axis, softmax->dims()); @@ -826,9 +989,19 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { // HardLabel auto* logits_data = softmax->data(); auto* labels_data = labels->data(); - HardLabelCrossEntropy(context.cuda_device_context(), logits_data, - labels_data, loss_data, n, d, axis_dim, - ignore_index); + int threads = 128; + int blocks = (n * d / axis_dim + threads - 1) / threads; + if (ignore_index >= 0 && ignore_index < axis_dim) { + CrossEntropyHardLabel<<< 
+ blocks, threads, 0, context.cuda_device_context().stream()>>>( + loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, + ignore_index); + } else { + CrossEntropyHardLabel<<< + blocks, threads, 0, context.cuda_device_context().stream()>>>( + loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, + ignore_index); + } } // cause of input is softmax @@ -886,9 +1059,17 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { auto* logits_data = logits->data(); auto* labels_data = labels->data(); - HardLabelSoftmaxWithCrossEntropy( - context.cuda_device_context(), logits_data, labels_data, loss_data, - softmax_data, n, d, axis_dim, ignore_index); + if (ignore_index >= 0 && ignore_index < axis_dim) { + SoftmaxWithCrossEntropyHardLabel( + context.cuda_device_context(), rank, axis, logits_data, + labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, + ignore_index); + } else { + SoftmaxWithCrossEntropyHardLabel( + context.cuda_device_context(), rank, axis, logits_data, + labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, + ignore_index); + } } } } @@ -959,14 +1140,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { - int64_t grid = (n * remain + block - 1) / block; const int64_t* label_data = labels->data(); - CrossEntropyGrad<<>>( - logit_grad_data, label_data, n, d, remain, ignore_index); - int64_t num = n * d; - grid = (num + block - 1) / block; - Scale<<>>(logit_grad_data, loss_grad_data, num, - d, remain, label_data, ignore_index); + int grid = (n * d + block - 1) / block; + SoftmaxWithCrossEntropyGradHardLabel<<>>( + logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, + ignore_index); } } }; diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 4800f5f9eb533..9e5e45f4d22d9 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -96,9 +96,10 @@ class StackGPUKernel : public framework::OpKernel { }; template -__global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, - int split_dim_size, int suf_dim_size, - int num_split, T** output_ptrs) { +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, int split_dim_size, + int suf_dim_size, int num_split, + T** output_ptrs) { assert(blockDim.y == 1); assert(blockDim.z == 1); // In this case they are equal @@ -114,6 +115,9 @@ __global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, IntType k = offset % suf_dim_size; T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } IntType output_ind = i * each_dim_size * suf_dim_size + (j % each_dim_size) * suf_dim_size + k; *(output + output_ind) = input[offset]; @@ -142,6 +146,9 @@ class StackGradGPUKernel : public framework::OpKernel { std::vector outputs(n); auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); for (size_t j = 0; j < dx.size(); ++j) { + if (dx[j] == nullptr) { + outputs[j] = nullptr; + } if (out_var_names[j] != framework::kEmptyVarName && dx[j]->numel() != 0UL) { T* ptr = dx[j]->mutable_data(ctx.GetPlace()); @@ -170,13 +177,13 @@ class StackGradGPUKernel : public framework::OpKernel { auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); if (dy->numel() < std::numeric_limits::max()) { - UnStackCUDAKernel< + UnStackHelperCUDAKernel< T, int32_t><<>>( dy_data, 
dy_pre, split_dim, dy_suf, split_dim, reinterpret_cast(tmp_out_data->ptr())); } else { - UnStackCUDAKernel< + UnStackHelperCUDAKernel< T, int64_t><<>>( dy_data, dy_pre, split_dim, dy_suf, split_dim, diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index 684bd476b6ef2..9785e73a4044e 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -48,7 +48,7 @@ class TopkNPUKernel : public framework::OpKernel { size_t k = static_cast(ctx.Attr("k")); output->mutable_data(ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); + indices->mutable_data(ctx.GetPlace()); // prepare assit auto dim = input->dims().size(); @@ -62,15 +62,24 @@ class TopkNPUKernel : public framework::OpKernel { {"dim", -1}, {"largest", true}}; + Tensor tmp_indices(framework::proto::VarType::INT32); + tmp_indices.Resize(indices->dims()); + tmp_indices.mutable_data(ctx.GetPlace()); + // run ascend auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, - {*output, *indices}, attr_input); - + {*output, tmp_indices}, attr_input); auto stream = ctx.template device_context() .stream(); - runner.Run(stream); + + // cast indices from INT32 to INT64 + auto dst_dtype = ConvertToNpuDtype(indices->type()); + auto runner_cast_indices = + NpuOpRunner("Cast", {tmp_indices}, {*indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); } }; diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 2bd2a2cbf34c6..99793ecd244cf 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -405,13 +405,13 @@ class UniqueKernel : public framework::OpKernel { bool return_counts = context.Attr("return_counts"); if (axis_vec.empty()) { - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueFlattendTensorFunctor( context, *x, out, return_index, return_inverse, return_counts)); } else { int axis = axis_vec[0]; - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueDimFunctor( context, *x, out, axis, return_index, return_inverse, return_counts)); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index cd8b31d72e72a..e9bc351de4d69 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -234,6 +234,7 @@ register_unity_group(cc save_combine_op.cc save_op.cc scale_op.cc + mkldnn/scale_mkldnn_op.cc scatter_nd_add_op.cc scatter_op.cc seed_op.cc diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0827d6a5ae764..12a54fd7e87f4 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -187,10 +187,12 @@ endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) +cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) + nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h new file mode 100644 index 
0000000000000..2c1b42ea4882d --- /dev/null +++ b/paddle/fluid/platform/complex.h @@ -0,0 +1,537 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include // NOLINT +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// todo +#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX +#endif + +namespace paddle { +namespace platform { + +template +struct PADDLE_ALIGN(sizeof(T) * 2) complex { + public: + T real; + T imag; + + complex() = default; + complex(const complex& o) = default; + complex& operator=(const complex& o) = default; + complex(complex&& o) = default; + complex& operator=(complex&& o) = default; + ~complex() = default; + + HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + template + HOSTDEVICE inline explicit complex(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + template + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + +#ifdef PADDLE_WITH_HIP + HOSTDEVICE inline explicit operator hipFloatComplex() const { + return make_hipFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator hipDoubleComplex() const { + return make_hipDoubleComplex(real, imag); + } +#else + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif +#endif + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE complex(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE inline explicit operator std::complex() const { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE complex(const std::complex& val) + : real(val.real()), imag(val.imag()) {} + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE inline complex& operator=(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + return *this; + } + + HOSTDEVICE inline explicit operator bool() const { + return 
static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +template +HOSTDEVICE inline complex operator+(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) + thrust::complex(b)); +#else + return complex(a.real + b.real, a.imag + b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) - thrust::complex(b)); +#else + return complex(a.real - b.real, a.imag - b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator*(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) * thrust::complex(b)); +#else + return complex(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +template +HOSTDEVICE inline complex operator/(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) / thrust::complex(b)); +#else + T denominator = b.real * b.real + b.imag * b.imag; + return complex((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(-thrust::complex(a.real, a.imag)); +#else + complex res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +template +HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator*=(complex& a, // 
NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag = a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + T denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +template +HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { + complex res; + res.real = a; + res.imag = 0.0; + return res; +} + +template +HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { + return a.real == b.real && a.imag == b.imag; +} + +template +HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { + return a.real != b.real || a.imag != b.imag; +} + +template +HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { + return a.real < b.real; +} + +template +HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { + return a.real <= b.real; +} + +template +HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { + return a.real > b.real; +} + +template +HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { + return a.real >= b.real; +} + +template +HOSTDEVICE inline complex max(const complex& a, const complex& b) { + return (a.real >= b.real) ? a : b; +} + +template +HOSTDEVICE inline complex min(const complex& a, const complex& b) { + return (a.real < b.real) ? 
a : b; +} + +template +HOSTDEVICE inline bool(isnan)(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isnan(a.real) || ::isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isinf(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isinf(a.real) || ::isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isfinite(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isfinite(a.real) || ::isfinite(a.imag); +#else + return std::isfinite(a.real) || std::isfinite(a.imag); +#endif +} + +template +HOSTDEVICE inline T abs(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::abs(thrust::complex(a)); +#else + return std::abs(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline complex pow(const complex& a, const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); +#else + return complex(std::pow(std::complex(a), std::complex(b))); +#endif +} + +template +HOSTDEVICE inline complex sqrt(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::sqrt(thrust::complex(a))); +#else + return complex(std::sqrt(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex tanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::tanh(thrust::complex(a))); +#else + return complex(std::tanh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex log(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::log(thrust::complex(a))); +#else + return complex(std::log(std::complex(a))); +#endif +} + +template +inline std::ostream& operator<<(std::ostream& os, const complex& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template +struct is_pod> { + static const bool value = true; +}; + +template +struct is_floating_point> + : std::integral_constant {}; + +template +struct is_signed> { + static const bool value = false; +}; + +template +struct is_unsigned> { + static const bool value = false; +}; + +template +inline bool isnan(const paddle::platform::complex& a) { + return paddle::platform::isnan(a); +} + +template +inline bool isinf(const paddle::platform::complex& a) { + return paddle::platform::isinf(a); +} + +template +struct numeric_limits> { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + 
static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static paddle::platform::complex min() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex lowest() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex max() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex epsilon() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex round_error() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex infinity() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex quiet_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex signaling_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex denorm_min() { + return paddle::platform::complex(0.0, 0.0); + } +}; + +} // namespace std diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc new file mode 100644 index 0000000000000..4d13161e94faf --- /dev/null +++ b/paddle/fluid/platform/complex_test.cc @@ -0,0 +1,324 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
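// CPU-side unit tests for paddle::platform::complex<T>.  The cases below cover construction
// and conversion (from integral and bool values, between complex<float> and complex<double>,
// and from std::complex), the comparison operators (equality checks both parts; ordering
// compares only the real part), arithmetic including the compound-assignment forms, stream
// output, and the isinf / isnan overloads picked up through std::.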
+ +#include "paddle/fluid/platform/complex.h" +#include +#include "paddle/fluid/platform/eigen_ext.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +TEST(complex, conversion_cpu) { + // *********** complex ************* + // float to complex + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); 
+ EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + 
EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu new file mode 100644 index 0000000000000..b46d1b7b271d7 --- /dev/null +++ b/paddle/fluid/platform/complex_test.cu @@ -0,0 +1,361 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
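// GPU-build counterparts of the complex<T> tests.  Besides the CPU cases, they exercise the
// conversions to and from thrust::complex and the native cuComplex / hipComplex types that
// are only available when building with CUDA or HIP.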
+ +#include "paddle/fluid/platform/complex.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/enforce.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +namespace paddle { +namespace platform { + +TEST(complex, conversion_on_gpu) { + // *********** complex ************* + // thrust from and to complex + complex a(1.0f, 2.0f); + EXPECT_EQ(complex(thrust::complex(a)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a)).imag, 2.0); + + complex a1(1.0, 2.0); + EXPECT_EQ(complex(thrust::complex(a1)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a1)).imag, 2.0); + +#if defined(PADDLE_WITH_HIP) + EXPECT_EQ(hipFloatComplex(a).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a).imag(), 2.0); + + EXPECT_EQ(hipFloatComplex(a1).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a1).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a1).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a1).imag(), 2.0); +#else + EXPECT_EQ(cuCrealf(cuFloatComplex(a)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a)), 2.0); + + EXPECT_EQ(cuCrealf(cuFloatComplex(a1)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a1)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a1)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a1)), 2.0); +#endif + + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + 
EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + 
EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 923c97350e89e..6405b55621766 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } +size_t NPUPinnedMaxAllocSize() 
{ + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t NPUPinnedMinChunkSize() { + // Allow to allocate the minimum chunk size is 64 KB. + return 1 << 16; +} + +size_t NPUPinnedMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED + // memory. + return NPUPinnedMaxAllocSize() / 256; +} + #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 94527149d4e0b..29dc0a15aaea1 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); +//! Get the maximum allocation size for a machine. +size_t NPUPinnedMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t NPUPinnedMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. +size_t NPUPinnedMaxChunkSize(); + typedef enum { isa_any, sse42, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 9a47ac45462ed..7e983eb54ae2c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported. Please " "re-compile with WITH_ASCEND_CL option.")); +#endif + } else if (platform::is_npu_pinned_place(p)) { +#ifdef PADDLE_WITH_ASCEND_CL + EmplaceDeviceContext( + &device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPinnedPlace is not supported. Please re-compile with " + "WITH_ASCEND_CL " + "option.")); #endif } } @@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } Place NPUDeviceContext::GetPlace() const { return place_; } aclrtContext NPUDeviceContext::context() const { return context_; } + +NPUPinnedDeviceContext::NPUPinnedDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place) + : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place NPUPinnedDeviceContext::GetPlace() const { return place_; } + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d91e14ec3aa92..e62f0673e97fa 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -233,6 +233,27 @@ template <> struct DefaultDeviceContextType { using TYPE = NPUDeviceContext; }; + +// Currently, NPUPinnedDeviceContext is only used to data copying. 
+class NPUPinnedDeviceContext : public DeviceContext { + public: + NPUPinnedDeviceContext(); + explicit NPUPinnedDeviceContext(NPUPinnedPlace place); + + Place GetPlace() const override; + + Eigen::DefaultDevice* eigen_device() const; + + private: + NPUPinnedPlace place_; + std::unique_ptr eigen_device_; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = NPUPinnedDeviceContext; +}; + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 8bff2ead0a2a3..21d9e8607459a 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,10 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc nvjpeg.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) + +if (NOT WITH_NV_JETSON) + list(APPEND CUDA_SRCS nvjpeg.cc) +endif() if (WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 0db4cc71b1b21..4eea87e909d1b 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -27,6 +28,8 @@ namespace Eigen { using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; +template +using complex = paddle::platform::complex; template struct NumTraits; @@ -105,6 +108,50 @@ struct NumTraits : GenericNumTraits> { static inline int digits10() { return NumTraits::digits10(); } }; +template <> +struct NumTraits> : GenericNumTraits> { + typedef float Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + +template <> +struct NumTraits> : GenericNumTraits> { + typedef double Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + template <> struct NumTraits : GenericNumTraits { enum { @@ -354,6 +401,138 @@ HOSTDEVICE inline double abs(const complex128& a) { return paddle::platform::abs(a); } +//////////// complex methods ///////////// + +template <> +HOSTDEVICE inline bool(isnan)(const complex& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE 
inline bool(isinf)(const complex& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex exp(const complex& a) { + float com = ::expf(a.real); + float res_real = com * ::cosf(a.imag); + float res_imag = com * ::sinf(a.imag); + return complex(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex log(const complex& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex tanh(const complex& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex sqrt(const complex& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex ceil(const complex& a) { + return complex(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex floor(const complex& a) { + return complex(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex round(const complex& a) { + return complex(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex pow(const complex& a, + const complex& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline float abs(const complex& a) { + return paddle::platform::abs(a); +} + +//////////// complex methods ///////////// + +template <> +HOSTDEVICE inline bool(isnan)(const complex& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const complex& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex exp(const complex& a) { + double com = ::expf(a.real); + double res_real = com * ::cosf(a.imag); + double res_imag = com * ::sinf(a.imag); + return complex(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex log(const complex& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex tanh(const complex& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex sqrt(const complex& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex ceil(const complex& a) { + return complex(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex floor(const complex& a) { + return complex(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex round(const complex& a) { + return complex(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex pow(const complex& a, + const complex& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline double abs(const complex& a) { + return paddle::platform::abs(a); +} + //////////// float16 methods ///////////// template <> diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h new file mode 100644 index 0000000000000..5c5903d62cd27 --- /dev/null +++ b/paddle/fluid/platform/fast_divmod.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/hostdevice.h" + +#define INT_BITS 32 + +namespace paddle { +namespace operators { + +template +struct alignas(sizeof(T) * Size) CudaAlignedVector { + T val[Size]; +}; + +struct FastDivMod { + // 1st value represents the result of input number divides by recorded divisor + // 2nd value represents the result of input number modulo by recorded divisor + using DivModT = CudaAlignedVector; + + FastDivMod() {} + HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { + static_assert(sizeof(unsigned int) == 4, + "Only Support 32-bit unsigned int."); + + for (shift_val = 0; shift_val < INT_BITS; ++shift_val) { + auto shift_limit = 1 << shift_val; + if (shift_limit >= divisor) break; + } + uint64_t long_one = 1; + uint64_t temp_div = + ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) / + divisor + + 1; + multiplier = temp_div; + } + + __device__ __forceinline__ uint32_t Div(uint32_t n) const { + uint32_t t = __umulhi(n, multiplier); + return (t + n) >> shift_val; + } + + __device__ __forceinline__ DivModT Divmod(uint32_t n) { + uint32_t q = Div(n); + DivModT result = {q, n - q * divisor}; + return result; + } + + int32_t divisor; + int32_t shift_val; + uint32_t multiplier; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index e584b849368e4..5ff6f893a8953 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -126,13 +126,20 @@ class MKLDNNHandlerT { return (dev_ctx_.GetBlob(key_p) != nullptr); } + bool isCachedNonBlocking() { + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + + return (fwd_pd_ != nullptr); + } + bool isBwdCached() { - const std::string key_pd = key_common_ + "@bwd_pd"; + const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); - const std::string key_p = key_ + "@bwd_p"; - return (dev_ctx_.GetBlob(key_p) != nullptr); + return (bwd_pd_ != nullptr); } // If your primitive descriptor requires attributes, pass them as a @@ -161,6 +168,20 @@ class MKLDNNHandlerT { } } + template + void AcquireForwardPrimitiveDescriptorNonBlocking(Arg&& first_arg, + Args&&... args) { + // This is used when we can recreate FWD PD in BWD so + // we do not need to pass FWD to BWD + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (fwd_pd_ == nullptr) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + dev_ctx_.SetBlob(key_pd, fwd_pd_); + } + } + // Using sfinae to specialise variadic function. Workaround for not having // if constexpr in C++ 11. template @@ -182,6 +203,8 @@ class MKLDNNHandlerT { std::make_shared(fwd_desc, engine_); } + // TODO(jczaja): After/if all ops can used xxxNonBlocking version + // then remove this one template void AcquireBackwardPrimitiveDescriptor(Args&&... 
args) { const std::string key_fwd_pd = key_common_ + "@fwd_pd"; @@ -201,6 +224,25 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardPrimitiveDescriptorNonBlocking(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptorNonBlocking + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_pd_ == nullptr) { + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -781,82 +823,6 @@ class ActivationMKLDNNHandler } }; -template -class LRNMKLDNNHandler - : public MKLDNNHandlerT { - public: - LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, - const platform::MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? 
mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } - } - - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, - k); - } - - std::shared_ptr AcquireWorkspaceMemory( - framework::Tensor* workspace) { - T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); - } - - std::shared_ptr AcquireBackwardWorkspaceMemory( - const framework::Tensor* workspace) { - const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - to_void_cast(workspace_data), - "@bwd-wrk_mem_p"); - } -}; - template class TransposeMKLDNNHandler : public MKLDNNHandler { public: diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 1cc9fd9fe7634..14c772d88897f 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> { } void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } + void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) { return boost::apply_visitor(IsCUDAPinnedPlace(), p); } +bool is_npu_pinned_place(const Place &p) { + return boost::apply_visitor(IsNPUPinnedPlace(), p); +} + bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index f20fac477d0ec..62d30ecc5ce2e 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -85,10 +85,19 @@ struct NPUPlace { int device; }; +struct NPUPinnedPlace { + NPUPinnedPlace() {} + + inline bool operator==(const NPUPinnedPlace &) const { return true; } + inline bool operator!=(const NPUPinnedPlace &) const { return false; } + inline bool operator<(const NPUPinnedPlace &) const { return false; } +}; + struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return true; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { 
return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; @@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return true; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } + bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +struct IsNPUPinnedPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace> { private: - using PlaceBase = - boost::variant; + using PlaceBase = boost::variant; public: Place() = default; @@ -139,6 +161,8 @@ class Place : public boost::variant(place)); @@ -155,6 +179,7 @@ bool is_xpu_place(const Place &); bool is_npu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); +bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -190,6 +215,17 @@ struct PlaceVisitorWrapper #endif } + typename Visitor::result_type operator()( + const NPUPinnedPlace &npu_pinned) const { +#ifdef PADDLE_WITH_ASCEND_CL + return visitor_(npu_pinned); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. 
Cannot visit npu_pinned")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return visitor_(cuda); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b30214e1d8355..5fcb1e30fbe67 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -73,6 +73,14 @@ if (WITH_CRYPTO) set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc) endif (WITH_CRYPTO) +if (WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties(heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif(WITH_PSLIB) if (WITH_PSCORE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -120,14 +128,20 @@ if(WITH_PYTHON) else() set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") endif() + file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path) + file(TO_NATIVE_PATH ${impl_file} impl_file) + file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file) + file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" "set build_times=1\n" ":retry\n" "ECHO op_function_generator run %build_times% time\n" - "${op_function_generator_path}/op_function_generator ${impl_file}\n" + "if exist ${tmp_impl_file} del ${tmp_impl_file}\n" + "taskkill /f /im op_function_generator.exe 2>NUL\n" + "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" - " if %build_times% GTR 5 (\n" + " if %build_times% GEQ 3 (\n" " exit /b 1\n" " ) else (\n" " goto :retry\n" @@ -137,6 +151,8 @@ if(WITH_PYTHON) add_custom_command(TARGET op_function_generator POST_BUILD COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" ) if(${CBLAS_PROVIDER} STREQUAL MKLML) @@ -168,7 +184,7 @@ if(WITH_PYTHON) "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} - COMMENT "copy_if_different ${impl_file}" + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" VERBATIM ) if(WITH_MKL) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 450c992d41118..ac1fab97644a1 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -469,6 +469,62 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, if (!PyTuple_Check(_index)) Py_DecRef(index); } +template +static 
void VarBaseCopy(std::shared_ptr &src, + imperative::VarBase &dst, const P &dst_device, + const bool blocking) { + if (dst.SharedVar()->IsEmpty()) { + VLOG(3) << "deep copy Variable from " << src->Name() << " to " + << dst.Name(); + dst.SetPersistable(src->Persistable()); + dst.SetDataType(src->DataType()); + dst.SetType(src->Type()); + dst.SetOverridedStopGradient(src->OverridedStopGradient()); + if (!src->SharedVar()->IsEmpty()) { + if (src->Var().IsType()) { + auto &src_tensor = src->Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, dst_device, dst_tensor); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_tensor.place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); + auto *dst_selected_rows = + dst.MutableVar()->GetMutable(); + dst_selected_rows->set_height(src_selected_rows.height()); + dst_selected_rows->set_rows(src_selected_rows.rows()); + framework::TensorCopy(src_selected_rows.value(), dst_device, + dst_selected_rows->mutable_value()); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_selected_rows.value().place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } + + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(src, dst_device); + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The source Tensor(%s) can not copy when it is empty.", src->Name())); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The destion Tensor(%s) can not copy when it is not empty.", + dst.Name())); + } +} + // Bind Methods void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; @@ -710,6 +766,13 @@ void BindImperative(py::module *m_ptr) { imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; + PADDLE_ENFORCE_EQ( + self->IsLeaf() && !self->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->Name())); + auto value_tensor = value_obj.cast>(); ins.insert({"ValueTensor", {value_tensor}}); @@ -1639,6 +1702,12 @@ void BindImperative(py::module *m_ptr) { self.nrings_ = nrings; }); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def( "dygraph_partial_grad", [](const std::vector> &input_targets, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 416361d06a996..586cbda7ccfc5 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -84,9 +84,9 @@ struct npy_format_descriptor { static constexpr auto name = _("bfloat16"); }; -// we register paddle::platform::complex64 as numpy.complex64. +// we register paddle::platform::complex as numpy.complex64. 
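For context on what this numpy registration provides on the Python side: once the complex types are mapped to NumPy's complex64/complex128, complex arrays can round-trip through paddle.to_tensor. A minimal sketch, assuming a Paddle build with complex support enabled:

import numpy as np
import paddle

# numpy complex64 maps onto Paddle's single-precision complex storage
x = paddle.to_tensor(np.array([1 + 2j, 3 - 4j], dtype=np.complex64))
print(x.dtype)          # should report paddle.complex64
print(x.numpy().dtype)  # complex64 again on the way back out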
template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX64); return reinterpret_borrow(ptr); @@ -103,9 +103,8 @@ struct npy_format_descriptor { static constexpr auto name = _("complext64"); }; -// we register paddle::platform::complex128 as numpy.complex128. template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX128); return reinterpret_borrow(ptr); @@ -168,8 +167,8 @@ struct ValidDTypeToPyArrayChecker { DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex64); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex128); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -188,9 +187,9 @@ inline std::string TensorDTypeToPyDTypeStr( } else if (std::is_same::value) { \ /* NumPy character code of uint16 due to no support for bfloat16 */ \ return "H"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "F"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "D"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ @@ -367,12 +366,14 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); } else if (py::isinstance>(array)) { // since there is still no support for bfloat16 in NumPy, // uint16 is used for casting bfloat16 @@ -594,9 +595,9 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, case framework::proto::VarType::BF16: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::COMPLEX64: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::COMPLEX128: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index e9153583f1337..2fe02dc51bf53 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -44,42 +44,33 @@ def __init__(self): self.requirement_build = r""" requirements: build: - - numpy>=1.12 + - numpy>=1.13 - cython - setuptools """ self.requirement_run = r""" run: - - numpy>1.12 + - requests>=2.20.0 + - numpy>=1.13 + - protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six - decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - - py-cpuinfo==5.0.0 - astor - - gast>=0.3.3 - - matplotlib """ self.requirement_run_windows = r""" run: - - numpy>=1.12 + - requests>=2.20.0 + - numpy>=1.13 + - protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six 
- decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - astor - - gast>=0.3.3 - - py-cpuinfo==5.0.0 """ self.test = r""" test: @@ -96,37 +87,20 @@ def __init__(self): """ self.build_const = r""" -pip install /package/objgraph-3.4.1.tar.gz -pip install /package/rarfile-3.0.tar.gz --no-deps """ self.blt_const = r""" -pip install C:\package\objgraph-3.4.1.tar.gz -pip install C:\package\rarfile-3.0.tar.gz --no-deps -git clone https://github.com/PaddlePaddle/recordio.git -cd recordio\python -python setup.py install """ - self.python27 = r" - python>=2.7, <3.0" - self.python35 = r" - python>=3.5, <3.6" self.python36 = r" - python>=3.6, <3.7" self.python37 = r" - python>=3.7, <3.8" self.python38 = r" - python>=3.8, <3.9" + self.python39 = r" - python>=3.9, <3.10" self.python_version = [ - self.python27, self.python35, self.python36, self.python37, - self.python38 + self.python36, self.python37, self.python38, self.python39 ] - self.cuda90 = r""" - - cudatoolkit>=9.0, <9.1 - - cudnn>=7.6, <7.7 - """ - self.cuda100 = r""" - - cudatoolkit>=10.0, <10.1 - - cudnn>=7.6, <7.7 - """ self.cuda101 = r""" - cudatoolkit>=10.1, <10.2 - cudnn>=7.6, <7.7 @@ -135,30 +109,31 @@ def __init__(self): - cudatoolkit>=10.2, <10.3 - cudnn>=7.6, <7.7 """ - self.cuda_info = [(self.cuda90, "cuda9.0", ".post90"), - (self.cuda100, "cuda10.0", ".post100"), - (self.cuda101, "cuda10.1", ".post101"), - (self.cuda102, "cuda10.2", "")] - self.py_str = ["py27", "py35", "py36", "py37", "py38"] + self.cuda112 = r""" + - cudatoolkit>=11.2, <11.3 + - cudnn>=8.1, <8.2 + """ + + self.cuda_info = [(self.cuda101, "cuda10.1", ".post101"), + (self.cuda102, "cuda10.2", ""), + (self.cuda112, "cuda11.2", ".post112")] + self.py_str = ["py36", "py37", "py38", "py39"] self.pip_end = ".whl --no-deps" self.pip_prefix_linux = "pip install /package/paddlepaddle" self.pip_prefix_windows = r"pip install C:\package\paddlepaddle" self.pip_gpu = "_gpu-" self.pip_cpu = "-" self.mac_pip = [ - "-cp27-cp27m-macosx_10_6_intel", "-cp35-cp35m-macosx_10_6_intel", "-cp36-cp36m-macosx_10_6_intel", "-cp37-cp37m-macosx_10_6_intel", - "-cp38-cp38-macosx_10_14_x86_64" + "-cp38-cp38-macosx_10_14_x86_64", "-cp39-cp39-macosx_10_14_x86_64" ] self.linux_pip = [ - "-cp27-cp27mu-manylinux1_x86_64", "-cp35-cp35m-manylinux1_x86_64", - "-cp36-cp36m-manylinux1_x86_64", "-cp37-cp37m-manylinux1_x86_64", - "-cp38-cp38-manylinux1_x86_64" + "-cp36-cp36m-linux_x86_64", "-cp37-cp37m-linux_x86_64", + "-cp38-cp38-linux_x86_64", "-cp39-cp39-linux_x86_64" ] self.windows_pip = [ - "-cp27-cp27m-win_amd64", "-cp35-cp35m-win_amd64", "-cp36-cp36m-win_amd64", "-cp37-cp37m-win_amd64", - "-cp38-cp38-win_amd64" + "-cp38-cp38-win_amd64", "-cp39-cp39-win_amd64" ] @@ -233,12 +208,7 @@ def meta_build_windows(var, requirement = var.requirement_build + python_str + var.requirement_run_windows + python_str meta_build = var.build + build_name_str meta_str = package_str + meta_build + requirement - if (python_str == var.python27 or python_str == var.python35): - meta_str = meta_str + """ - - matplotlib<=2.2.4""" - else: - meta_str = meta_str + """ - - matplotlib""" + if not (cuda_str == None): meta_str = meta_str + cuda_str diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index e53828ff10be6..69138a37f461c 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -26,11 +26,10 @@ set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) 
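The wheel-tag lists in conda_build.py above (pip_prefix_linux, pip_gpu, cuda_info suffixes, linux_pip tags, pip_end) are the pieces the script later concatenates into concrete pip install commands. A rough illustration follows; the exact assembly code is not shown in this hunk, and the 2.1.0 version below is an illustrative placeholder, not a value taken from the diff.

# Illustrative only: how the string pieces above compose into one install line.
pip_prefix_linux = "pip install /package/paddlepaddle"
pip_gpu = "_gpu-"
pip_end = ".whl --no-deps"
version = "2.1.0"          # placeholder version
cuda_suffix = ".post101"   # from cuda_info for cuda10.1
platform_tag = "-cp38-cp38-linux_x86_64"

print(pip_prefix_linux + pip_gpu + version + cuda_suffix + platform_tag + pip_end)
# pip install /package/paddlepaddle_gpu-2.1.0.post101-cp38-cp38-linux_x86_64.whl --no-deps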
-taskkill /f /im op_function_generator.exe 2>NUL taskkill /f /im cmake.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL -taskkill /f /im CL.exe 2>NUL -taskkill /f /im Lib.exe 2>NUL +taskkill /f /im cl.exe 2>NUL +taskkill /f /im lib.exe 2>NUL taskkill /f /im link.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL @@ -41,14 +40,12 @@ taskkill /f /im python.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ @@ -79,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 rem -------set cache build directory----------- rmdir build\python /s/q +rmdir build\paddle\fluid\pybind /s/q rmdir build\paddle_install_dir /s/q rmdir build\paddle_inference_install_dir /s/q rmdir build\paddle_inference_c_install_dir /s/q @@ -112,6 +110,17 @@ if %ERRORLEVEL% EQU 0 ( git branch last_pr ) +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before=< %cache_dir%\day.txt +if %day_now% NEQ %day_before% ( + echo %day_now% > %cache_dir%\day.txt + type %cache_dir%\day.txt + rmdir build /s/q + goto :mkbuild +) + :: git diff HEAD origin/develop --stat --name-only :: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat" :: if %ERRORLEVEL% EQU 0 ( @@ -137,10 +146,11 @@ goto :CASE_%1 echo "Usage: paddle_build.bat [OPTION]" echo "OPTION:" -echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows" -echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" +echo "wincheck_mkl: run Windows MKL/GPU PR CI tasks on Windows" +echo "wincheck_openbals: run Windows OPENBLAS/CPU PR CI tasks on Windows" echo "build_avx_whl: build Windows avx whl package on Windows" echo "build_no_avx_whl: build Windows no avx whl package on Windows" +echo "build_inference_lib: build Windows inference library on Windows" exit /b 1 rem ------PR CI windows check for MKL/GPU---------- @@ -200,6 +210,7 @@ goto:success rem ------Build windows inference library------ :CASE_build_inference_lib +set ON_INFER=ON set WITH_PYTHON=OFF set CUDA_ARCH_NAME=All @@ -249,9 +260,10 @@ if "%WITH_GPU%"=="ON" ( ) rem ------initialize the python environment------ +@ECHO ON set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% -if %WITH_PYTHON% == "OFF" ( +if %WITH_PYTHON% == "ON" ( where python where pip pip install wheel --user @@ -373,6 +385,7 @@ set build_times=1 rem clcache.exe -z rem -------clean up environment again----------- +taskkill /f /im cmake.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im cl.exe 2>NUL taskkill /f /im lib.exe 2>NUL @@ -385,14 +398,17 @@ taskkill /f /im csc.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill 
/f /im op_function_generator.exe 2>NUL +wmic process where name="cmake.exe" call terminate 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL + +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# +) echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( @@ -760,15 +776,16 @@ taskkill /f /im python.exe 2>NUL taskkill /f /im nvcc.exe 2>NUL taskkill /f /im cicc.exe 2>NUL taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# +) echo Windows CI run successfully! exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0865d48c0d343..ff3ded9f9ea56 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -248,6 +248,12 @@ function cmake_base() { distibuted_flag=${WITH_DISTRIBUTE:-OFF} gloo_flag=${distibuted_flag} + if [ "$CMD" != "assert_file_approvals" ];then + python -m pip install distro + python ${PADDLE_ROOT}/tools/summary_env.py + bash ${PADDLE_ROOT}/tools/get_cpu_info.sh + fi + cat <${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" || [ "$1" == "cp39-cp39" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ] || [ "$1" == "cp39-cp39" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path @@ -1256,11 +1262,13 @@ set +x testcase='' done <<< "$test_cases"; - card_test "$single_card_tests_high_parallel" 1 8 # run cases the most each time with single GPU + card_test "$single_card_tests_high_parallel" 1 6 # run cases the most each time with single GPU card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU + card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs + card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs card_test "$exclusive_tests_non_parallel" -1 # run cases 
exclusively, in this cases would be run with 2/4/8 GPUs collect_failed_tests @@ -1409,6 +1417,175 @@ EOF fi } +function insert_pile_to_h_cu_diff { + # TODO get develop h/cu md5 + cd ${PADDLE_ROOT} + find ${PADDLE_ROOT} -name '*.h'| grep -v ${PADDLE_ROOT}/build >> ${PADDLE_ROOT}/tools/h_cu_files.log + find ${PADDLE_ROOT} -name '*.cu'| grep -v ${PADDLE_ROOT}/build >> ${PADDLE_ROOT}/tools/h_cu_files.log + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'get_h_file_md5' ${PADDLE_ROOT} + + # TODO insert pile to diff h/cu file + + #insert pile to full h/cu file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'insert_pile_to_h_file' ${PADDLE_ROOT} +} + +function precise_card_test_single { + set +e + set +x + testcases=$1 + num=$2 + for case in $(echo $testcases | tr "$|^" "\n") + do + cd ${PADDLE_ROOT}/build + precise_card_test "^${case}$" $num + # c++ + if [ -d "${PADDLE_ROOT}/build/ut_map/$case" ];then + rm -rf ${PADDLE_ROOT}/build/ut_map/$case + fi + set -x + mkdir ${PADDLE_ROOT}/build/ut_map/$case + find paddle/fluid -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/fluid -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case + python ${PADDLE_ROOT}/tools/get_single_test_cov.py ${PADDLE_ROOT} $case & + + # python + ls python-coverage.data.* + if [[ $? == 0 ]] + then + mkdir -p ${PADDLE_ROOT}/build/pytest/$case + mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case + fi + find paddle/fluid -name *.gcda | xargs rm -f #delete gcda + done +} + +function precise_card_test() { + set -m + testcases=$1 + if (( $# > 1 )); then + cardnumber=$2 + cuda_list="0" + if [ $cardnumber -eq 2 ]; then + cuda_list=${CUDA_VISIBLE_DEVICES} + else + cuda_list="0" + fi + else + cardnumber=2 + cuda_list=${CUDA_VISIBLE_DEVICES} + fi + + if [[ "$testcases" == "" ]]; then + return 0 + fi + + echo "****************************************************************" + echo "***Running ut: $testcases***" + echo "****************************************************************" + + tmpfile=$tmp_dir/$testcases".log" + env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I 0,,1 -R "($testcases)" --timeout 500 --output-on-failure -V -j 1 > $tmpfile + set +m +} + +function get_precise_tests_map_file { + cd ${PADDLE_ROOT}/build + pip install ${PADDLE_ROOT}/build/python/dist/*whl + ut_total_startTime_s=`date +%s` + EXIT_CODE=0; + test_cases=$(ctest -N -V) # get all test cases + single_card_tests='' # all cases list which would take one graph card + exclusive_tests='' # cases list which would be run exclusively + multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs + is_exclusive='' # indicate whether the case is exclusive type + is_multicard='' # indicate whether the case is multiple GPUs type +set +x + + while read -r line; do + if [[ "$line" == "" ]]; then + continue + fi + read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#') + if [[ "$matchstr" == "" ]]; then + # Any test case with LABELS property would be parse here + # RUN_TYPE=EXCLUSIVE mean the case would run exclusively + # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime + read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE") + read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST") + continue + fi + read testcase <<< $(echo "$line"|grep -oEi "\w+$") + + if [[ "$is_multicard" == "" ]]; then + # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") + fi + + if [[ 
"$is_exclusive" != "" ]]; then + if [[ "$exclusive_tests" == "" ]]; then + exclusive_tests="^$testcase$" + else + exclusive_tests="$exclusive_tests|^$testcase$" + fi + elif [[ "$is_multicard" != "" ]]; then + if [[ "$multiple_card_tests" == "" ]]; then + multiple_card_tests="^$testcase$" + else + multiple_card_tests="$multiple_card_tests|^$testcase$" + fi + else + if [[ "${single_card_tests}" -gt 3000 ]];then + if [[ "$single_card_tests_1" == "" ]]; then + single_card_tests_1="^$testcase$" + else + single_card_tests_1="$single_card_tests_1|^$testcase$" + fi + continue + fi + if [[ "$single_card_tests" == "" ]]; then + single_card_tests="^$testcase$" + else + single_card_tests="$single_card_tests|^$testcase$" + fi + fi + is_exclusive='' + is_multicard='' + is_nightly='' + matchstr='' + testcase='' + done <<< "$test_cases"; + +set -x + mkdir -p ${PADDLE_ROOT}/build/ut_map + mkdir -p ${PADDLE_ROOT}/build/pytest + + precise_card_test_single "$single_card_tests" 1 + precise_card_test_single "$single_card_tests_1" 1 + precise_card_test_single "$multiple_card_tests" 2 + precise_card_test_single "$exclusive_tests" + + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_not_success_ut' ${PADDLE_ROOT} + + if [[ -f "${PADDLE_ROOT}/build/utNotSuccess" ]]; then + rerun_tests=`cat ${PADDLE_ROOT}/build/utNotSuccess` + precise_card_test_single "$rerun_tests" + fi + wait; + + #generate python coverage and generate python file to tests_map_file + python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} + + #analy h/cu to Map file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'analy_h_cu_file' $tmp_dir ${PADDLE_ROOT} + + #generate ut map + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} + wait; +} + + + function parallel_test_base_xpu() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -1446,11 +1623,11 @@ set -x } function parallel_test() { - ut_total_startTime_s=`date +%s` mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + ut_total_startTime_s=`date +%s` if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else @@ -1902,10 +2079,6 @@ function main() { local CMD=$1 local parallel_number=$2 init - if [ "$CMD" != "assert_file_approvals" ];then - python ${PADDLE_ROOT}/tools/summary_env.py - bash ${PADDLE_ROOT}/tools/get_cpu_info.sh - fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} @@ -1982,6 +2155,12 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + ci_preciseTest) + insert_pile_to_h_cu_diff + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + get_precise_tests_map_file + ;; cicheck_brpc) cmake_gen ${PYTHON_ABI:-""} build ${parallel_number} diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 054fcdfcbe651..ee4dcaa897940 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -269,10 +269,10 @@ # high-level api from .hapi import Model # noqa: F401 -from .hapi import callbacks # noqa: F401 +from . import callbacks # noqa: F401 from .hapi import summary # noqa: F401 from .hapi import flops # noqa: F401 -from .hapi import hub # noqa: F401 +from . 
import hub # noqa: F401 import paddle.text # noqa: F401 import paddle.vision # noqa: F401 @@ -335,10 +335,8 @@ 'unsqueeze_', 'argmax', 'Model', - 'callbacks', 'summary', 'flops', - 'hub', 'sort', 'split', 'logical_and', diff --git a/python/paddle/callbacks.py b/python/paddle/callbacks.py new file mode 100644 index 0000000000000..08fab3e0adb5e --- /dev/null +++ b/python/paddle/callbacks.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .hapi.callbacks import Callback # noqa: F401 +from .hapi.callbacks import ProgBarLogger # noqa: F401 +from .hapi.callbacks import ModelCheckpoint # noqa: F401 +from .hapi.callbacks import VisualDL # noqa: F401 +from .hapi.callbacks import LRScheduler # noqa: F401 +from .hapi.callbacks import EarlyStopping # noqa: F401 +from .hapi.callbacks import ReduceLROnPlateau # noqa: F401 + +__all__ = [ #noqa + 'Callback', + 'ProgBarLogger', + 'ModelCheckpoint', + 'VisualDL', + 'LRScheduler', + 'EarlyStopping', + 'ReduceLROnPlateau' +] diff --git a/python/paddle/device.py b/python/paddle/device.py index 035d240e713fe..803d54e11bea3 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -25,7 +25,7 @@ 'set_device', 'get_device', 'XPUPlace', - 'is_compiled_with_xpu' + 'is_compiled_with_xpu', # 'cpu_places', # 'CPUPlace', # 'cuda_pinned_places', diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index ba4c3b09f9ff7..d3df57fcf6b7d 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -25,6 +25,7 @@ from ..fluid.data_feeder import check_dtype from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils +from ..fluid.dygraph import layers from ..fluid.dygraph.parallel import prepare_context import paddle from .fleet import fleet @@ -98,6 +99,13 @@ def get_group_rank(self, rank): else: return -1 + def __repr__(self): + debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( + self.rank, self.nranks, self.id) + debug_str += ", ".join(map(str, self.ranks)) + debug_str += ". 
" + return debug_str + _global_env = None @@ -875,6 +883,86 @@ def _mp_allreduce(tensor, raise NotImplementedError("No support _mp_allreduce in dygraph mode.") +class _Linear(layers.Layer): + """ + Linear + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(_Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter( + shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.name = name + + def forward(self, input): + out = _linear( + x=input, weight=self.weight, bias=self.bias, name=self.name) + return out + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'in_features={}, out_features={}, dtype={}{}'.format( + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) + + +def _linear(x, weight, bias=None, name=None): + """ + Fuction Linear + """ + if in_dygraph_mode(): + pre_bias = _varbase_creator(dtype=x.dtype) + core.ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) + return dygraph_utils._append_bias_in_dygraph( + pre_bias, bias, axis=len(x.shape) - 1) + else: + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + assert len( + x.shape) < 4, "X latitude is not supported greater than 3 now." + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', inputs=inputs, outputs={'Out': tmp}, attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], + 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res + + def _parallel_linear(x, num_rows, num_cols, @@ -900,12 +988,20 @@ def _parallel_linear(x, else: x = _c_identity(x, group=group) - linear = paddle.nn.Linear( - num_rows, - num_cols, - weight_attr=param_attr, - bias_attr=bias_attr, - name=name) + if core.is_compiled_with_npu(): + linear = _Linear( + num_rows, + num_cols, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + else: + linear = paddle.nn.Linear( + num_rows, + num_cols, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) linear_out = linear(x) startup_block = paddle.static.default_startup_program().global_block() diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 708c76ac55abe..27437c50fad66 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -74,10 +74,17 @@ def _get_ascend_rankfile(rank_table_file_path): device_count = 0 server_list = json_data['server_list'] for server in server_list: - node_ips.append(server['server_id']) device_list = server['device'] device_count = len(device_list) - + if os.getenv("FLAGS_MODELARTS", None): + nodes = os.getenv("DLS_TASK_NUMBER", None) + assert nodes is not None, "DLS_TASK_NUMBER didn't set!" 
+ for node in range(int(nodes)): + node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) + assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" + node_ips.append(node_ip) + return node_ips, device_count + node_ips.append(server['server_id']) return node_ips, device_count diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index a44d008fe9a31..f9cd623afef76 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -626,7 +626,7 @@ def find_unused_parameters(self): Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. - Default value: True + Default value: False Examples: @@ -827,6 +827,32 @@ def sharding_configs(self, configs): "sharding_configs") assign_configs_value(self.strategy.sharding_configs, configs) + @property + def without_graph_optimization(self): + """ + Run program using Executor other than ParallelExecutor. + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + + """ + return self.strategy.without_graph_optimization + + @without_graph_optimization.setter + @is_strict_auto + def without_graph_optimization(self, flag): + if isinstance(flag, bool): + self.strategy.without_graph_optimization = flag + else: + print( + "WARNING: without_graph_optimization should have value of bool type" + ) + @property def pipeline(self): """ @@ -923,6 +949,8 @@ def tensor_parallel_configs(self): **Notes**: **Detailed arguments for tensor_parallel_configs** **tensor_parallel_degree**: degree of tensor parallel + **tensor_init_seed**: parameter initialization random seed + Examples: @@ -931,7 +959,8 @@ def tensor_parallel_configs(self): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.tensor_parallel = True - strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4} + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, + "tensor_init_seed": 123} """ return get_msg_dict(self.strategy.tensor_parallel_configs) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index a7564a23a7cfb..5e883f1ac6cc9 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -17,6 +17,7 @@ import warnings import paddle import os +import numpy as np from paddle.fluid.framework import dygraph_only from paddle.fluid import compiler from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase @@ -28,7 +29,7 @@ from paddle.fluid.dygraph import parallel_helper from . 
import topology as tp from .topology import ParallelMode -from ..meta_parallel import ModelParallel +from ..meta_parallel import TensorParallel, model_parallel_random_seed from ..meta_parallel import PipelineParallel from ..meta_optimizers import HybridParallelOptimizer from ..meta_optimizers import HybridParallelGradScaler @@ -279,6 +280,14 @@ def _init_hybrid_parallel_env(self): self._hcg = tp.HybridCommunicateGroup(self._topology) + if self.mp_degree > 1: + tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs + tensor_init_seed = tensor_parallel_configs["tensor_init_seed"] + if tensor_init_seed == -1: + model_parallel_random_seed() + else: + model_parallel_random_seed(tensor_init_seed) + def get_hybrid_communicate_group(self): assert self._hcg is not None return self._hcg @@ -580,6 +589,49 @@ def stop_worker(self): """ self._runtime_handle._stop_worker() + def save(self, dirname, feed=[], fetch=[], **configs): + inference = True + + if not feed and not fetch: + inference = False + + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + + if inference: + feeded_var_names = [] + fetch_var_names = [] + + for var in feed: + if isinstance(var, str): + feeded_var_names.append(var) + elif isinstance(var, paddle.static.Variable): + feeded_var_names.append(var.name) + else: + raise ValueError("feed must be [str|Variable]") + + for var in fetch: + if isinstance(var, str): + fetch_var_names.append(var) + elif isinstance(var, paddle.static.Variable): + fetch_var_names.append(var.name) + else: + raise ValueError("feed must be [str|Variable]") + + fetch_vars = [ + paddle.static.default_main_program().global_block().var(name) + for name in fetch_var_names + ] + + self._runtime_handle._save_inference_model( + executor, dirname, feeded_var_names, fetch_vars, None, True, 0) + else: + increment_mode = 0 + if "mode" in configs: + increment_mode = int(configs["mode"]) + self._runtime_handle._save_persistables( + executor, dirname, main_program=None, mode=increment_mode) + def save_inference_model(self, executor, dirname, @@ -607,6 +659,9 @@ def save_inference_model(self, fleet.init_server() """ + # warnings.warn( + # "'save_inference_model' is a deprecated, will be deleted after v2.2.0, Please use fleet.save instead." + # ) self._runtime_handle._save_inference_model( executor, dirname, feeded_var_names, target_vars, main_program, @@ -653,6 +708,9 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ + # warnings.warn( + # "'save_persistables' is a deprecated, will be deleted after v2.2.0, Please use fleet.save instead." + # ) self._runtime_handle._save_persistables(executor, dirname, main_program, mode) @@ -780,8 +838,8 @@ def forward(self, x): last_comm_group_size_MB, find_unused_parameters=self._user_defined_strategy. 
find_unused_parameters) - elif self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL: - distributed_model = ModelParallel( + elif self._hcg.get_parallel_mode() == ParallelMode.TENSOR_PARALLEL: + distributed_model = TensorParallel( model, self._hcg, strategy=self._user_defined_strategy) elif self._hcg.get_parallel_mode() == ParallelMode.PIPELINE_PARALLEL: distributed_model = PipelineParallel( diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 470a4d83aac3f..04d8417fdcbf3 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -28,7 +28,7 @@ class ParallelMode(object): DATA_PARALLEL = 0 - MODEL_PARALLEL = 1 + TENSOR_PARALLEL = 1 PIPELINE_PARALLEL = 2 @@ -155,12 +155,12 @@ def __init__(self, topology): _HYBRID_PARALLEL_GROUP = self def get_parallel_mode(self): - # there are three modes : DataParallel / ModelParallel / PipelineParallel + # there are three modes : DataParallel / TensorParallel / PipelineParallel if self._mp_degree == 1 and self._pp_degree == 1: return ParallelMode.DATA_PARALLEL elif self._mp_degree > 1 and self._pp_degree == 1: # initialize the seed - return ParallelMode.MODEL_PARALLEL + return ParallelMode.TENSOR_PARALLEL elif self._pp_degree > 1: return ParallelMode.PIPELINE_PARALLEL @@ -253,3 +253,8 @@ def get_pipe_parallel_group(self): # check parallel group def get_check_parallel_group(self): return self._check_comm_group + + def get_rank_from_stage(self, stage_id): + coord = self._topo.get_coord(self.global_rank) + tf = coord._replace(pipe=stage_id)._asdict() + return self._topo.get_rank(**tf) diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 2f428346b9c0c..dc41e3589812f 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -33,6 +33,7 @@ def __init__(self): self.dataset = core.Dataset("MultiSlotDataset") self.thread_num = 1 self.filelist = [] + self.use_ps_gpu = False def init(self, batch_size=1, @@ -214,6 +215,15 @@ def _prepare_to_run(self): self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_readers() + def _set_use_ps_gpu(self, use_ps_gpu): + """ + set use_ps_gpu flag + + Args: + use_ps_gpu: bool + """ + self.use_ps_gpu = use_ps_gpu + def _finish_to_run(self): self.dataset.destroy_readers() @@ -531,12 +541,18 @@ def _prepare_to_run(self): def _dynamic_adjust_before_train(self, thread_num): if not self.is_user_set_queue_num: - self.dataset.dynamic_adjust_channel_num(thread_num, False) + if self.use_ps_gpu: + self.dataset.dynamic_adjust_channel_num(thread_num, True) + else: + self.dataset.dynamic_adjust_channel_num(thread_num, False) self.dataset.dynamic_adjust_readers_num(thread_num) def _dynamic_adjust_after_train(self): if not self.is_user_set_queue_num: - self.dataset.dynamic_adjust_channel_num(self.thread_num, False) + if self.use_ps_gpu: + self.dataset.dynamic_adjust_channel_num(self.thread_num, True) + else: + self.dataset.dynamic_adjust_channel_num(self.thread_num, False) self.dataset.dynamic_adjust_readers_num(self.thread_num) def _set_queue_num(self, queue_num): diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index be7ad257ccb99..c69b21538b61a 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -12,9 +12,7 @@ # See the License for the 
specific language governing permissions and # limitations under the License. -import functools import logging -import socket import time import os import signal @@ -27,6 +25,7 @@ import socket import warnings import six +import struct import paddle import paddle.fluid as fluid @@ -362,6 +361,10 @@ def add_arguments(argname, type, default, help, argparser, **kwargs): def find_free_ports(num): def __free_port(): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + # Note(wangxi): Close the connection with a TCP RST instead + # of a TCP FIN, to avoid time_wait state. + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) s.bind(('', 0)) return s.getsockname()[1] @@ -376,7 +379,7 @@ def __free_port(): return port_set step += 1 - if step > 100: + if step > 400: print( "can't find avilable port and use the specified static port now!" ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 827835fde20e3..1788e044fe885 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -28,3 +28,4 @@ from .dygraph_optimizer import HybridParallelOptimizer from .dygraph_optimizer import HybridParallelGradScaler from .tensor_parallel_optimizer import TensorParallelOptimizer +from .raw_program_optimizer import RawProgramOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index d0e8034f5cae1..c0f671e7e446b 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -31,7 +31,7 @@ def __init__(self, scaler, hcg): self._scaler = scaler self._hcg = hcg self._is_mp = ( - self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) + self._hcg.get_parallel_mode() == ParallelMode.TENSOR_PARALLEL) def scale(self, var): return self._scaler.scale(var) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index b7ac298d2223e..c2d79a62c7663 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -89,13 +89,15 @@ def __init__(self, optimizer, hcg, strategy): self._inner_opt = optimizer self._strategy = strategy self._hcg = hcg - self._is_mp = ( - self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) + + self._use_dp_mode = ( + self._hcg.get_parallel_mode() == ParallelMode.DATA_PARALLEL) + self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) if isinstance(self._inner_opt._grad_clip, - ClipGradByGlobalNorm) and self._is_mp: - logger.warning("using ClipGradByGlobalNorm in ModelParallel, the origin " \ + ClipGradByGlobalNorm) and not self._use_dp_mode: + logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ "optmizer'grad clip will be changed.") self._inner_opt._grad_clip = HybridParallelClipGrad( self._inner_opt._grad_clip, hcg) @@ -103,7 +105,7 @@ def __init__(self, optimizer, hcg, strategy): @imperative_base.no_grad @framework.dygraph_only def step(self): - if 
self._is_mp and self._need_dp: + if not self._use_dp_mode and self._need_dp: fused_allreduce_gradients( list(self._inner_opt._parameter_list), self._hcg) self._inner_opt.step() @@ -119,7 +121,7 @@ def minimize(self, parameter_list = parameters if parameters \ else self._parameter_list - if self._is_mp and self._need_dp: + if not self._use_dp_mode and self._need_dp: fused_allreduce_gradients(list(parameter_list), self._hcg) return self._inner_opt.minimize(loss, startup_program, parameters, diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 4194cf13d2bbc..22ed3f2ac4160 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -63,9 +63,9 @@ def _setup_nccl_op(self, startup_program, main_program, build_strategy): trainer_endpoints_env = ",".join(trainer_endpoints) trainers_num = self.role_maker._worker_num() - # FIXME(wangxi): approve this. - #if trainer_id == 0: - # wait_server_ready(other_trainers) + # NOTE(wangxi): npu don't need to wait server ready + if trainer_id == 0 and not paddle.is_compiled_with_npu(): + wait_server_ready(other_trainers) if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py new file mode 100755 index 0000000000000..243f6efe53185 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
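The `RawProgramOptimizer` defined in the new file below only takes effect when the `without_graph_optimization` strategy flag introduced earlier in this change is switched on. A minimal sketch of enabling that path through fleet, assuming a simple static-graph program started with the distributed launcher (the network here is just a placeholder):

```python
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

# Route minimize() through RawProgramOptimizer (plain Executor, no
# ParallelExecutor graph passes); the flag is added by this change.
strategy = fleet.DistributedStrategy()
strategy.without_graph_optimization = True

x = paddle.static.data(name="x", shape=[None, 32], dtype="float32")
y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
pred = paddle.static.nn.fc(x, size=1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

opt = fleet.distributed_optimizer(
    paddle.optimizer.SGD(learning_rate=0.01), strategy=strategy)
# With more than one trainer this inserts the scale (1/nranks) and
# c_allreduce_sum ops shown in the optimizer below.
opt.minimize(loss)
```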
+# See the License for the specific language governing permissions and + +from __future__ import print_function +from __future__ import division +import os + +import paddle.fluid as fluid +from paddle.fluid import core, unique_name +from ..base.private_helper_function import wait_server_ready +from .meta_optimizer_base import MetaOptimizerBase +from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op + + +class RawProgramOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(RawProgramOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [ + "RecomputeOptimizer", + "AMPOptimizer", + ] + self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self.global_ring_id = 0 + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(RawProgramOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + self.without_graph_optimization = user_defined_strategy.without_graph_optimization + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + + if self.without_graph_optimization == True: + return True + return False + + def _disable_strategy(self, dist_strategy): + dist_strategy.without_graph_optimization = False + + def _enable_strategy(self, dist_strategy, context): + dist_strategy.without_graph_optimization = True + + def _broadcast_params(self, ring_id): + block = self.startup_program.global_block() + param = None + for param in block.iter_parameters(): + if param.is_distributed: + continue + + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + + if not param: return # no parameter on this device + block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + def _get_process_group_info(self): + # global ring info + self.global_endpoints = self.endpoints + self.global_rank = self.rank + self.global_nranks = self.nranks + + def _init_process_group(self): + self._get_process_group_info() + collective_helper = CollectiveHelper(self.role_maker, wait_port=False) + # Create global ring for all gpus (ring_id = 0) + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.global_endpoints, + self.global_rank, self.global_ring_id, True, self.global_ring_id, + True) + self._broadcast_params(self.global_ring_id) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + self.endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.endpoints[self.role_maker._worker_index()] + self.rank = self.role_maker._worker_index() + self.nranks = self.role_maker._worker_num() + if startup_program is None: + startup_program = fluid.default_startup_program() + self.startup_program = startup_program + + block = loss.block + program = block.program + self.main_program = program + + optimize_ops, params_grads = self.inner_opt.minimize( + loss, startup_program, parameter_list, no_grad_set) + + self._init_process_group() + + self.main_program = program + if self.nranks > 1: + self._transpile_main_program(loss) + return optimize_ops, params_grads + + def _transpile_main_program(self, loss): + self._insert_loss_grad_ops(loss) + self._insert_allreduce_ops() + + 
def _insert_loss_grad_ops(self, loss): + """ + In order to keep the learning rate consistent in different numbers of + training workers, we scale the loss grad by the number of workers + """ + block = self.main_program.global_block() + for idx, op in reversed(list(enumerate(block.ops))): + if is_loss_grad_op(op): + loss_grad_var = block.vars[op.output_arg_names[0]] + block._insert_op( + idx + 1, + type='scale', + inputs={'X': loss_grad_var}, + outputs={'Out': loss_grad_var}, + attrs={ + 'scale': 1.0 / self.nranks, + OP_ROLE_KEY: OpRole.Backward + }) + + def _insert_allreduce_ops(self): + block = self.main_program.global_block() + ring_id = self.global_ring_id + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and \ + OP_ROLE_VAR_KEY in op.attr_names: + op_role_var = op.attr(OP_ROLE_VAR_KEY) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + offset = 1 + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + param = block.var(param_name) + grad_name = op_role_var[i + 1] + grad = block.var(grad_name) + if param.is_distributed: + continue + + block._insert_op( + idx + offset, + type='c_sync_calc_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={OP_ROLE_KEY: OpRole.Backward, }) + offset += 1 + block._insert_op( + idx + offset, + type='c_allreduce_sum', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward + }) + + if grad is None: + return + + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward}) + break diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index f4ceb2d287a56..ca3606c16e5d4 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -402,13 +402,18 @@ def get_grad_device(grad_name, shard): return shard.global_param2device[base_name] -def get_first_check_finite_and_unscale_op_idx(block): +def get_first_check_finite_and_unscale_op_idx(block, raise_error=True): for idx, op in enumerate(block.ops): if op.type == "check_finite_and_unscale": return idx - raise ValueError("check_finite_and_unscale does not exist in block") + if raise_error: + raise ValueError( + "amp is turned on but check_finite_and_unscale op does not exist in main block" + ) + + return -1 def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 82e54a89e104f..aafb15e0a01f8 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -298,7 +298,7 @@ def minimize_impl(self, print("persistable FP32 grad: ") print(accumulated_grad_names) first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block) + main_block, raise_error=self.user_defined_strategy.amp) insert_reduce_ops( main_block, first_optimize_op_index, @@ -309,14 +309,15 @@ def minimize_impl(self, use_calc_stream=True) if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block) - 
insert_allreduce_ops( - main_block, - first_optimize_op_index, - self.dp_ring_id, - accumulated_grad_names, - core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True) + main_block, raise_error=self.user_defined_strategy.amp) + if first_optimize_op_index >= 0: + insert_allreduce_ops( + main_block, + first_optimize_op_index, + self.dp_ring_id, + accumulated_grad_names, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) # if not use sharding, adapt amp/clip, for remain parallelism. # cast --> amp --> clip --> opt diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py index ed74d8e744e50..894771a3d5005 100644 --- a/python/paddle/distributed/fleet/meta_parallel/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py @@ -20,7 +20,7 @@ from .parallel_layers import RNGStatesTracker # noqa: F401 from .parallel_layers import model_parallel_random_seed # noqa: F401 from .parallel_layers import get_rng_state_tracker # noqa: F401 -from .model_parallel import ModelParallel # noqa: F401 +from .tensor_parallel import TensorParallel # noqa: F401 from .pipeline_parallel import PipelineParallel # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index af59b16e22aa8..730a7430133e0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -41,6 +41,7 @@ def __init__(self, self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() self.origin_num_embeddings = num_embeddings + self.is_mp = (self.world_size > 1) per_part_size = ( num_embeddings + self.world_size - 1) // self.world_size @@ -50,16 +51,36 @@ def __init__(self, per_part_size += 1 # make the last row as the padding index self.per_part_size = per_part_size - self.embedding = paddle.nn.Embedding( - per_part_size, - embedding_dim, - padding_idx=per_part_size - 1, - sparse=False, - weight_attr=weight_attr, - name=name) - self.embedding.weight.is_distributed = True + self._dtype = self._helper.get_default_dtype() + self._size = [per_part_size, embedding_dim] + self._weight_attr = weight_attr + self._name = name + + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + self.weight[per_part_size - 1] = 0.0 + self.weight.is_distributed = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=[num_embeddings, embedding_dim], + dtype=self._dtype, + is_bias=False) def forward(self, x): + if not self.is_mp: + return F.embedding( + x, + weight=self.weight, + padding_idx=None, + sparse=False, + name=self._name) + origin_input_shape = x.shape if len(origin_input_shape) == 2: x = paddle.unsqueeze(x, axis=-1) @@ -72,13 +93,18 @@ def forward(self, x): if len(origin_input_shape) == 2: x_shard = paddle.squeeze(x_shard, axis=-1) - emb_out = self.embedding(x_shard) - if self.world_size > 1: - emb_out = paddle.distributed.collective._mp_allreduce( - emb_out, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) + emb_out = F.embedding( + x_shard, + weight=self.weight, + padding_idx=self.per_part_size - 1, + sparse=False, + name=self._name) + + emb_out = paddle.distributed.collective._mp_allreduce( 
+ emb_out, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) return emb_out @@ -96,8 +122,9 @@ def __init__(self, ) self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( ) + self._name = name + self.is_mp = (self.world_size > 1) - self.name = name self.gather_output = gather_output assert out_features % self.world_size == 0, ( "Number of column of the weight for linear ({}) must be" @@ -108,10 +135,20 @@ def __init__(self, self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype) + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.weight.is_distributed = True if has_bias: @@ -119,18 +156,24 @@ def __init__(self, self.bias = self.create_parameter( shape=[self.output_size_per_partition], attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype) + dtype=self._dtype, + is_bias=True) self.bias.is_distributed = True else: self.bias = None def forward(self, x): # use inner api to process identity - input_parallel = paddle.distributed.collective._c_identity( - x, group=self.model_parallel_group) + if self.is_mp: + input_parallel = paddle.distributed.collective._c_identity( + x, group=self.model_parallel_group) + else: + input_parallel = x + output_parallel = F.linear( - input_parallel, self.weight, self.bias, name=self.name) - if self.gather_output: + input_parallel, self.weight, self.bias, name=self._name) + + if self.gather_output and self.is_mp: output = paddle.distributed.collective._c_concat( output_parallel, nranks=self.world_size, @@ -155,7 +198,7 @@ def __init__(self, self.input_is_parallel = input_is_parallel self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() - self.name = name + self._name = name self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( ) @@ -163,6 +206,7 @@ def __init__(self, ) self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() + self.is_mp = (self.world_size > 1) assert in_features % self.world_size == 0, ( "Number of row of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(in_features, @@ -170,22 +214,33 @@ def __init__(self, self.input_size_per_partition = in_features // self.world_size - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype) + if self.is_mp: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.weight.is_distributed = True if has_bias: self.bias = self.create_parameter( shape=[self.out_features], attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype) + dtype=self._dtype, + is_bias=True) else: self.bias = None def forward(self, x): - if self.input_is_parallel: + if 
self.input_is_parallel or (not self.is_mp): input_parallel = x else: # split last dim @@ -195,12 +250,16 @@ def forward(self, x): nranks=self.world_size, group=self.model_parallel_group) - output_parallel = F.linear(input_parallel, self.weight, name=self.name) - output_ = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) + output_parallel = F.linear(input_parallel, self.weight, name=self._name) + + if self.is_mp: + output_ = paddle.distributed.collective._mp_allreduce( + output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + else: + output_ = output_parallel output = output_ + self.bias if self.bias is not None else output_ return output diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 41c9deabd1e11..70daa3b25365e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -14,6 +14,7 @@ import paddle import contextlib +import numpy as np __all__ = [] @@ -65,14 +66,18 @@ def get_rng_state_tracker(): return RNG_STATE_TRACKER -def model_parallel_random_seed(seed=2048): +def model_parallel_random_seed(seed=None): import paddle.distributed.fleet as fleet hcg = fleet.get_hybrid_communicate_group() rank = hcg.get_model_parallel_rank() - local_seed = seed + 1024 + rank - global_seed = seed + if seed: + global_seed = seed + local_seed = seed * 1024 + rank * 100 + else: + global_seed = np.random.randint(0, 655350) + local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1) RNG_STATE_TRACKER.reset() - paddle.seed(global_seed) RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) + paddle.seed(global_seed) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 79e5bc2ffeda0..54324b389336d 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -11,39 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
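With the revised `model_parallel_random_seed` above, every tensor-parallel rank shares one global seed but registers its own local seed with the RNG state tracker, so sharded weights are initialized differently on each rank. A small illustration of the derivation, assuming an explicit seed of 123 and four ranks (the helper is hypothetical and only mirrors the formula in the change):

```python
# Mirrors the seed split in model_parallel_random_seed (illustrative only).
def derive_seeds(seed, rank):
    global_seed = seed                      # passed to paddle.seed()
    local_seed = seed * 1024 + rank * 100   # tracked per rank for mp init
    return global_seed, local_seed

for rank in range(4):
    print(rank, derive_seeds(123, rank))
# 0 (123, 125952)
# 1 (123, 126052)
# 2 (123, 126152)
# 3 (123, 126252)
```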
# See the License for the specific language governing permissions and -import time -import copy -import os - from types import MethodType -from numpy import prod - import paddle import paddle.fluid as fluid from .meta_parallel_base import MetaParallelBase -from .pp_utils.utils import get_tensor_bytes, is_float_tensor +from .pp_utils.utils import is_float_tensor, get_tensor_dtype, paddle_2_number, number_2_dtype from .pp_utils import utils from .parallel_layers.pp_layers import PipelineLayer from ..utils.hybrid_parallel_util import broadcast_mp_parameters from ..utils.hybrid_parallel_util import broadcast_dp_parameters -from ..utils.hybrid_parallel_util import fused_allreduce_gradients from ..utils.log_util import logger +from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer __all__ = [] -FLOAT_TYPES = [ - paddle.float16, - paddle.float32, - paddle.float64, -] - class PipelineParallel(MetaParallelBase): def __init__(self, layers, hcg, strategy): + if not isinstance(layers, PipelineLayer): + raise TypeError( + "The Layer should be a derived class of PipelineLayer.") super(PipelineParallel, self).__init__(layers, hcg, strategy) - self.use_pipe_parallel = self._hcg.get_pipe_parallel_world_size() > 1 self.use_data_parallel = self._hcg.get_data_parallel_world_size() > 1 self.use_model_parallel = self._hcg.get_model_parallel_world_size() > 1 @@ -63,8 +53,6 @@ def __init__(self, layers, hcg, strategy): self.current_loss = paddle.to_tensor(0.0) self.total_loss = None - self.use_amp = self._strategy.amp - self.init_loss_scaling = self._strategy.amp_configs['init_loss_scaling'] self.micro_batch_size = self._strategy.pipeline_configs[ 'micro_batch_size'] self.accumulate_steps = self._strategy.pipeline_configs[ @@ -75,6 +63,11 @@ def __init__(self, layers, hcg, strategy): self.prev_stage_id = self.stage_id - 1 self.next_stage_id = self.stage_id + 1 self.pp_group = self._hcg.get_pipe_parallel_group() + + self.is_first_stage = self.stage_id == 0 + self.is_last_stage = (self.stage_id == (self.num_stages - 1)) + self.global_rank = self._hcg.get_global_rank() + logger.info("Pipeline Info -- num_stages: {}, stage_id: {}".format( self.num_stages, self.stage_id)) @@ -83,51 +76,72 @@ def __init__(self, layers, hcg, strategy): broadcast_mp_parameters(self._layers, self._hcg) if self.use_data_parallel: - logger.info("start broadcast mp parameters") + logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def _allocate_caches(self, num_caches): + def _init_caches(self, num_caches): if self.num_caches >= num_caches: return - - num = num_caches - self.num_caches - self.num_caches = num_caches + self.num_caches = num_caches - self.num_caches for key in self.caches: - self.caches[key].extend([None] * num) + self.caches[key].extend([None] * self.num_caches) + + def _reduce_final_loss(self): + if self.is_last_stage: + assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" + loss = self.total_loss.clone() / self.accumulate_steps + paddle.distributed.broadcast( + loss, + src=self.global_rank, + use_calc_stream=True, + group=self.pp_group) + else: + loss = paddle.to_tensor(0.0) + paddle.distributed.broadcast( + loss, + src=self._hcg.get_rank_from_stage(self.num_stages - 1), + use_calc_stream=True, + group=self.pp_group) + return loss - def train_batch(self, data, optimizer): + def train_batch(self, data, optimizer, lr_scheduler=None): + assert isinstance(optimizer, HybridParallelOptimizer), ( + 'optimizer should be 
HybridParallelOptimizer subclass.') self.optimizer = optimizer + self.lr_scheduler = lr_scheduler assert fluid.framework._dygraph_tracer()._has_grad, ( 'Please enable the generation of gradients.') - if self.stage_id == 0 or self.stage_id == self.num_stages - 1: - assert data, ( + if self.is_first_stage or self.is_last_stage: + assert data is not None, ( "For the first and the last stage, the data_iter must be set.") else: - assert data is None, ( - "For pipe stages other than the first and the last one, " - "the data_iter must be None.") + data = None + self.data = data self._layers.train() - self.total_loss = None - - minibatch_cmds = utils.TrainGenerator(self.accumulate_steps, - self.num_stages, self.stage_id) - self._train(minibatch_cmds) - return self.total_loss - def _train(self, minibatch_cmds): - self._allocate_caches(self.accumulate_steps) - for micro_cmds in minibatch_cmds: - for cmd in micro_cmds: - assert type(cmd) in self._COMMAND_MAP, "unknow cmd: {}".format( - type(cmd)) - self._apply_cmd = MethodType(self._COMMAND_MAP[type(cmd)], self) - self._apply_cmd(**cmd.kwargs) - - def _allreduce_grads(self): - if not self.use_data_parallel: return - fused_allreduce_gradients(list(self._layers.parameters()), self._hcg) + # store total loss of entire batch + self.total_loss = None + self._init_caches(self.accumulate_steps) + startup_steps = self.num_stages - self.stage_id - 1 + forward_steps = 0 + backward_steps = 0 + + # forward + while (forward_steps < self.accumulate_steps): + self._forward(cache_id=forward_steps) + forward_steps += 1 + + # backward + while (backward_steps < self.accumulate_steps): + self._backward(cache_id=backward_steps) + backward_steps += 1 + + # optimizer + self._step() + self.train_loss = self._reduce_final_loss() + return self.train_loss def _forward(self, cache_id): # load data @@ -140,16 +154,17 @@ def _forward(self, cache_id): else: inputs = self.caches['inputs'][cache_id] - self._clear_grads(inputs) outputs = self._layers.forward(inputs) + self._clear_grads(inputs) + self.caches['outputs'][cache_id] = outputs - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: if self._layers._loss_fn is not None: labels = self.caches['labels'][cache_id] outputs = self._layers._loss_fn(outputs, labels) - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: self.current_loss = outputs if isinstance(self.current_loss, paddle.Tensor): if self.total_loss is None: @@ -162,18 +177,17 @@ def _forward(self, cache_id): ] for idx, v in enumerate(self.current_loss): self.total_loss[idx] += v.detach() - if self.use_data_parallel: - self.current_loss = self.current_loss / self._hcg.get_data_parallel_world_size( - ) + if self.accumulate_steps > 1: self.current_loss = self.current_loss / self.accumulate_steps + self.caches['outputs'][cache_id] = self.current_loss.clone() + else: self._send_activations(cache_id) def _backward(self, cache_id): - assert self.optimizer is not None - if self.stage_id == self.num_stages - 1: + if self.is_last_stage: paddle.autograd.backward(self.caches['outputs'][cache_id]) self._send_gradients(cache_id) return @@ -194,92 +208,89 @@ def _backward(self, cache_id): grad_tensors = None if self.stage_id != 0: self._send_gradients(cache_id) self.caches['outputs'][cache_id] = None - #self.caches['backward_tensors'][cache_id] = None - def _get_data(self): - if self.use_model_parallel: - mp_rank = self._hcg.get_model_parallel_rank() + def _broadcast_data(self, data): + if isinstance(data, paddle.Tensor): + paddle.distributed.broadcast( + 
data, + src=self._hcg.get_model_parallel_group_src_rank(), + group=self._hcg.get_model_parallel_group()) else: - mp_rank = 0 - - # mp rank 0 loads the data and broadcat it to others. - data = self.data - if self.use_model_parallel and (self.stage_id == 0 or - self.stage_id == self.num_stages - 1): - assert isinstance(data, (tuple, paddle.Tensor)) - if isinstance(data, paddle.Tensor): + for d in data: + assert isinstance(d, paddle.Tensor) paddle.distributed.broadcast( - data, + d, src=self._hcg.get_model_parallel_group_src_rank(), group=self._hcg.get_model_parallel_group()) - else: - data = [] - for d in self.data: - assert isinstance(d, paddle.Tensor) - paddle.distributed.broadcast( - d, - src=self._hcg.get_model_parallel_group_src_rank(), - group=self._hcg.get_model_parallel_group()) - data.append(d) - data = tuple(data) return data def _load_micro_batch(self, cache_id): - inputs = self._get_data() - - if self.stage_id == 0: - data = None - #if isinstance(inputs[0], paddle.Tensor): - if len(inputs) == 1: - assert isinstance(inputs[0], paddle.Tensor) - data = inputs[0].clone().detach() - #data.stop_gradient = not is_float_tensor(data) - data.stop_gradient = True + inputs = self.data + begin = cache_id * self.micro_batch_size + end = begin + self.micro_batch_size + + if self.is_first_stage: + assert len(inputs) == 2, "length of input should be 2" + if self.use_model_parallel: + inputs[0] = self._broadcast_data(inputs[0]) + if isinstance(inputs[0], tuple): + batch_size = inputs[0][0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size, ( + "batch_size needs to be divisible by micro_batch_size. Currently, " + "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." + % + (batch_size, self.micro_batch_size, self.accumulate_steps)) + data = [ + input[begin:end, :].clone().detach() for input in inputs[0] + ] + self.caches['inputs'][cache_id] = tuple(data) + else: + batch_size = inputs[0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + self.caches['inputs'][cache_id] = inputs[0][begin:end, :].clone( + ).detach() + elif self.is_last_stage: + assert len(inputs) == 2, "length of input should be 2" + if self.use_model_parallel: + inputs[1] = self._broadcast_data(inputs[1]) + if isinstance(inputs[1], tuple): + batch_size = inputs[1][0].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + data = [ + input[begin:end, :].clone().detach() for input in inputs[1] + ] + self.caches['labels'][cache_id] = tuple(data) else: - assert isinstance(inputs, tuple) - data = [] - for d in inputs: - assert isinstance(d, paddle.Tensor) - i = d.clone().detach() - #i.stop_gradient = not is_float_tensor(i) - i.stop_gradient = True - data.append(i) - data = tuple(data) - self.caches['inputs'][cache_id] = data - - if self.stage_id == self.num_stages - 1: - labels = None - #if isinstance(inputs[1], paddle.Tensor): - if len(inputs) == 1: - assert isinstance(inputs[0], paddle.Tensor) - labels = inputs[0] - elif isinstance(inputs, tuple): - labels = [] - for label in inputs: - assert isinstance(label, paddle.Tensor) - label = label.detach() - labels.append(label) - labels = tuple(labels) - self.caches['labels'][cache_id] = labels + batch_size = inputs[1].shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size + self.caches['labels'][cache_id] = inputs[1][begin:end, :].clone( + ).detach() + else: + # No data input is required for other stages + inputs = None def _send_meta(self, data, peer): - """ - % type (0: tensor, 
1: tuple) - % num_tensors if type=tuple - foreach tensor: - % ndims - % shape - """ if isinstance(data, paddle.Tensor): tensor_type = paddle.to_tensor([0]) + # send tensor type paddle.distributed.send( tensor_type, peer, use_calc_stream=True, group=self.pp_group) + + # send len(shape) dims = paddle.to_tensor(len(data.shape)) paddle.distributed.send( dims, peer, use_calc_stream=True, group=self.pp_group) + + # send shape shape = paddle.to_tensor(data.shape) paddle.distributed.send( shape, peer, use_calc_stream=True, group=self.pp_group) + + # send dtype + dtype = paddle.to_tensor(paddle_2_number(data.dtype)) + paddle.distributed.send( + dtype, peer, use_calc_stream=True, group=self.pp_group) + elif isinstance(data, tuple): tensor_type = paddle.to_tensor([1]) paddle.distributed.send( @@ -289,48 +300,73 @@ def _send_meta(self, data, peer): nums, peer, use_calc_stream=True, group=self.pp_group) for idx, d in enumerate(data): assert isinstance(d, paddle.Tensor) + # send len(shape) dims = paddle.to_tensor(len(d.shape)) paddle.distributed.send( dims, peer, use_calc_stream=True, group=self.pp_group) + + # send shape shape = paddle.to_tensor(d.shape) paddle.distributed.send( shape, peer, use_calc_stream=True, group=self.pp_group) + # send dtype + dtype = paddle.to_tensor(paddle_2_number(d.dtype)) + paddle.distributed.send( + dtype, peer, use_calc_stream=True, group=self.pp_group) + def _recv_meta(self, peer): tensor_type = paddle.to_tensor([0]) paddle.distributed.recv( tensor_type, peer, use_calc_stream=True, group=self.pp_group) - tensor_type = tensor_type.numpy()[0] + tensor_type = tensor_type.item() if tensor_type == 0: + # recv len(shape) dims = paddle.to_tensor([0]) paddle.distributed.recv( dims, peer, use_calc_stream=True, group=self.pp_group) - dims = dims.numpy()[0] + dims = dims.item() + + # recv shape shape = paddle.to_tensor([0] * dims) paddle.distributed.recv( shape, peer, use_calc_stream=True, group=self.pp_group) shape = shape.numpy().tolist() - return self._allocate_buffer( - shape, dtype="float32", num_caches=1)[0] + + # recv dtype + dtype = paddle.to_tensor([0]) + paddle.distributed.recv( + dtype, peer, use_calc_stream=True, group=self.pp_group) + return self._allocate_cache( + shape, dtype=number_2_dtype(dtype.item()), num_caches=1)[0] elif tensor_type == 1: num = paddle.to_tensor([0]) paddle.distributed.recv( num, peer, use_calc_stream=True, group=self.pp_group) - num = num.numpy()[0] + num = num.item() shapes = [] + dtypes = [] for i in range(num): + # recv len(shape) dims = paddle.to_tensor([0]) paddle.distributed.recv( dims, peer, use_calc_stream=True, group=self.pp_group) - dims = dims.numpy()[0] + + # recv shape + dims = dims.item() shape = paddle.to_tensor([0] * dims) paddle.distributed.recv( shape, peer, use_calc_stream=True, group=self.pp_group) shapes.append(shape.numpy().tolist()) - dtypes = ["float32"] * len(shapes) - caches = self._allocate_buffers(shapes, dtypes, num_caches=1)[0] + # recv dtype + dtype = paddle.to_tensor([0]) + paddle.distributed.recv( + dtype, peer, use_calc_stream=True, group=self.pp_group) + dtypes.append(number_2_dtype(dtype.item())) + + caches = self._allocate_caches(shapes, dtypes, num_caches=1)[0] caches = tuple(caches) return caches @@ -357,7 +393,6 @@ def _send_activations(self, cache_id): def _send_gradients(self, cache_id): inputs = self.caches['inputs'][cache_id] - if isinstance(inputs, paddle.Tensor): assert inputs.grad is not None paddle.distributed.send( @@ -371,7 +406,6 @@ def _send_gradients(self, cache_id): if not 
is_float_tensor(d): assert d.grad is None continue - assert d.grad is not None paddle.distributed.send( d.grad, self.prev_stage_id, @@ -381,8 +415,6 @@ def _send_gradients(self, cache_id): def _recv_activations(self, cache_id): inputs = None - - # Allocate the buffer if necessary if self.recv_cache is None: self.recv_cache = self._recv_meta(self.prev_stage_id) @@ -419,14 +451,16 @@ def _recv_gradients(self, cache_id): if self.grad_tensors is None: if isinstance(outputs, paddle.Tensor): s = list(outputs.shape) - dtype = 'float16' if self.use_amp else "float32" - self.grad_tensors = self._allocate_buffer( - s, dtype, num_buffers=1)[0] + dtype = get_tensor_dtype(outputs.dtype) + self.grad_tensors = self._allocate_cache( + s, dtype, num_caches=1)[0] else: sizes = [list(d.shape) for d in outputs if is_float_tensor(d)] - dtypes = ['float16'] * len( - sizes) if self.use_amp else ['float32'] * len(sizes) - self.grad_tensors = self._allocate_buffers( + dtypes = [ + get_tensor_dtype(d.dtype) for d in outputs + if is_float_tensor(d) + ] + self.grad_tensors = self._allocate_caches( sizes, dtypes, num_caches=1)[0] if isinstance(self.grad_tensors, paddle.Tensor): @@ -445,9 +479,10 @@ def _recv_gradients(self, cache_id): group=self.pp_group) def _step(self): - self._allreduce_grads() self.optimizer.step() - self.optimizer.clear_gradients() + self.optimizer.clear_grad() + if self.lr_scheduler: + self.lr_scheduler.step() def _clear_grads(self, inputs): if isinstance(inputs, paddle.Tensor): @@ -461,7 +496,7 @@ def _clear_grads(self, inputs): def _allocate_zeros(self, shape, dtype): return paddle.zeros(shape, dtype) - def _allocate_buffer(self, shape, dtype, num_caches=-1): + def _allocate_cache(self, shape, dtype, num_caches=-1): caches = [] if num_caches == -1: num_caches = self.num_caches @@ -469,7 +504,7 @@ def _allocate_buffer(self, shape, dtype, num_caches=-1): caches.append(self._allocate_zeros(shape, dtype)) return caches - def _allocate_buffers(self, shapes, dtypes, num_caches=-1): + def _allocate_caches(self, shapes, dtypes, num_caches=-1): caches = [] if num_caches == -1: num_caches = self.num_caches @@ -488,11 +523,5 @@ def load_state_dict(self, model_path): state_dict = paddle.load(self.model_path) self._layers.set_state_dict(state_dict) - _COMMAND_MAP = { - utils.Optimize: _step, - utils.Forward: _forward, - utils.Backward: _backward, - } - def forward(self, *inputs, **kwargs): raise RuntimeError("Call train_batch for pipeline instead of forward.") diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index e5c5709f98d95..8c204820b1661 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -14,20 +14,51 @@ import abc import paddle -from ...utils import hybrid_parallel_util as hp_util +from ...utils import log_util as hp_util __all__ = [] -FLOAT_TYPES = [ - paddle.float16, - paddle.float32, - paddle.float64, -] +FLOAT_TYPE_DICT = { + paddle.float16: "float16", + paddle.float32: "float32", + paddle.float64: "float64", +} + +PADDLE_TO_NUMBER = { + paddle.float16: 0, + paddle.float32: 1, + paddle.float64: 2, + paddle.int32: 3, + paddle.int64: 4 +} + +NUMBER_TO_DTYPE = { + 0: "float16", + 1: "float32", + 2: "float64", + 3: "int32", + 4: "int64" +} def is_float_tensor(tensor): """Is a float tensor""" - return tensor.dtype in FLOAT_TYPES + return tensor.dtype in FLOAT_TYPE_DICT.keys() + + +def get_tensor_dtype(dtype): + 
assert dtype in FLOAT_TYPE_DICT.keys() + return FLOAT_TYPE_DICT[dtype] + + +def paddle_2_number(dtype): + assert dtype in PADDLE_TO_NUMBER.keys() + return PADDLE_TO_NUMBER[dtype] + + +def number_2_dtype(number): + assert number in NUMBER_TO_DTYPE.keys() + return NUMBER_TO_DTYPE[number] def get_tensor_bytes(tensor): @@ -48,78 +79,3 @@ def get_tensor_bytes(tensor): else: raise ValueError("unknown data type: {}".format(tensor.dtype)) return tensor.numel() * elem_size - - -class Generator(): - def __init__(self, micro_batches, stages, stage_id): - __metaclass__ = abc.ABCMeta - - self.micro_batches = micro_batches - self.stages = stages - self.stage_id = stage_id - self.prev_stage = self.stage_id - 1 - self.next_stage = self.stage_id + 1 - - @abc.abstractmethod - def generate(self): - pass - - def __iter__(self): - self.iter = None - return self - - def __next__(self): - if self.iter is None: - self.iter = self.generate() - return next(self.iter) - - -class TrainGenerator(Generator): - def generate(self): - startup_steps = self.stages - self.stage_id - 1 - cmds = [] - forward_steps = 0 - backward_steps = 0 - #while (forward_steps < startup_steps): - # cmds.append(Forward(cache_id=forward_steps)) - # forward_steps += 1 - #while (forward_steps < self.micro_batches): - # cmds.append(Forward(cache_id=forward_steps)) - # forward_steps += 1 - # cmds.append(Backward(cache_id=backward_steps)) - # backward_steps += 1 - #while (backward_steps < self.micro_batches): - # cmds.append(Backward(cache_id=backward_steps)) - # backward_steps += 1 - #cmds.append(Optimize()) - while (forward_steps < self.micro_batches): - cmds.append(Forward(cache_id=forward_steps)) - forward_steps += 1 - while (backward_steps < self.micro_batches): - cmds.append(Backward(cache_id=backward_steps)) - backward_steps += 1 - cmds.append(Optimize()) - yield cmds - - -class Command: - def __init__(self, **kwargs): - self.name = self.__class__.__name__ - self.kwargs = kwargs - for key, val in kwargs.items(): - setattr(self, key, val) - - def __repr__(self): - return hp_util.call_to_str(self.name, **self.kwargs) - - -class Optimize(Command): - pass - - -class Forward(Command): - pass - - -class Backward(Command): - pass diff --git a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py similarity index 89% rename from python/paddle/distributed/fleet/meta_parallel/model_parallel.py rename to python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py index 682d7152a42bd..1dbf668d6e13a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py @@ -22,15 +22,15 @@ __all__ = [] -class ModelParallel(MetaParallelBase): +class TensorParallel(MetaParallelBase): def __init__(self, layers, hcg, **kwargs): - super(ModelParallel, self).__init__(layers, hcg, **kwargs) + super(TensorParallel, self).__init__(layers, hcg, **kwargs) def _prepare_for_model(self): logger.info("start broadcast mp parameters") broadcast_mp_parameters(self._layers, self._hcg) - logger.info("start broadcast mp parameters") + logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) logger.info("mp's parameters is ready") diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index ce68eb9a1fb4a..d31fa549ad562 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ 
b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -32,7 +32,7 @@ def conv_indent(indent): return "".join([" "] * indent) -PSERVER_SAVE_SUFFIX = "_txt" +PSERVER_SAVE_SUFFIX = ".shard" class Accessor: @@ -916,7 +916,7 @@ def _save_sparse_params(self, executor, dirname, context, main_program, self.compiled_strategy.origin_main_program, True) values = [] for id, names in context.items(): - if names not in distributed_varnames: + if names[0] not in distributed_varnames: # only save sparse param to local self._worker.recv_and_save_model(id, dirname) # save sparse & distributed param on server @@ -953,11 +953,11 @@ def _save_distributed_persistables(self, TheOnePSRuntime.__exclude_vars(saved_varnames), main_program.list_vars())) - fluid.io.save_vars( - executor, - main_program=main_program, - dirname=dirname, - vars=remaining_vars) + import paddle + for var in remaining_vars: + tensor = var.get_value() + paddle.save( + tensor, os.path.join(dirname, var.name), use_binary_format=True) def _ps_inference_save_persistables(self, executor, @@ -978,20 +978,19 @@ def _ps_inference_save_persistables(self, if isinstance(executor, ParallelExecutor): raise TypeError( - "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed" + "in fleet.save() function, executor must be as Executor type, ParallelExecutor is not allowed" ) if not isinstance(executor, Executor): raise TypeError( - "in fleet.save_persistables() function, executor must be as Executor type" - ) + "in fleet.save() function, executor must be as Executor type") if main_program is None: main_program = self.compiled_strategy.get_origin_ps_main_program() if isinstance(main_program, CompiledProgram): raise TypeError( - "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed" + "in fleet.save() function, main_program must be as Program type, CompiledProgram is not allowed" ) # Todo(MrChengmo): Save optimizer status @@ -1013,37 +1012,36 @@ def _ps_inference_save_inference_model(self, if isinstance(executor, ParallelExecutor): raise TypeError( - "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed" + "in fleet.save() function, executor must be as Executor type, ParallelExecutor is not allowed" ) if not isinstance(executor, Executor): raise TypeError( - "in fleet.save_inference_model() function, executor must be as Executor type" + "in fleet.save() function, executor must be as Executor type") + + import paddle + program = self.origin_main_program if main_program is None else main_program + + if isinstance(program, CompiledProgram): + raise TypeError( + "in fleet.save() function, main_program must be as Program type, CompiledProgram is not allowed" ) - if main_program is not None: - if isinstance(main_program, CompiledProgram): - raise TypeError( - "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed" - ) - fluid.io.save_inference_model(dirname, feeded_var_names, - target_vars, executor, main_program, - None, None, export_for_deployment) - else: - fluid.io.save_inference_model(dirname, feeded_var_names, - target_vars, executor, - self.origin_main_program, None, None, - export_for_deployment, True) - model_basename = "__model__" - model_filename = os.path.join(dirname, model_basename) - - with open(model_filename, "rb") as f: - program_desc_str = f.read() - - program = Program.parse_from_string(program_desc_str) - 
program._copy_dist_param_info_from(fluid.default_main_program()) - self._ps_inference_save_persistables(executor, dirname, program, - mode) + feed_vars = [ + program.global_block().var(name) for name in feeded_var_names + ] + + infer_program = paddle.static.normalize_program(program, feed_vars, + target_vars) + + infer_program._copy_dist_param_info_from(program) + + model_basename = "__model__" + model_basename = os.path.join(dirname, model_basename) + paddle.save(infer_program, model_basename) + + self._ps_inference_save_persistables(executor, dirname, infer_program, + mode) def _save_inference_model(self, *args, **kwargs): self._ps_inference_save_inference_model(*args, **kwargs) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 5521bd5b95283..ddbd6111b4609 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -44,7 +44,15 @@ def _apply_collective_grads(parameters, comm_group): for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad = coalesced_grad / comm_group.nranks + div_factor = paddle.to_tensor( + comm_group.nranks, dtype=coalesced_grad.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': coalesced_grad, + 'Y': div_factor}, + outputs={'Out': coalesced_grad}, + attrs={'axis': -1}) + paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 004e1c1aa9bc5..491f8a7e25cbc 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -39,7 +39,7 @@ fake_quantize_dequantize_types = [ "fake_quantize_dequantize_abs_max", - "fake_quantize_dequantize_channel_wise_abs_max", + "fake_channel_wise_quantize_dequantize_abs_max", "fake_quantize_dequantize_moving_average_abs_max" ] diff --git a/python/paddle/fluid/contrib/slim/tests/README.md b/python/paddle/fluid/contrib/slim/tests/README.md index 169cb686168f8..8688c96b7bd47 100644 --- a/python/paddle/fluid/contrib/slim/tests/README.md +++ b/python/paddle/fluid/contrib/slim/tests/README.md @@ -207,13 +207,29 @@ Run the following commands to download and extract Quant model: ```bash mkdir -p /PATH/TO/DOWNLOAD/MODEL/ cd /PATH/TO/DOWNLOAD/MODEL/ -export QUANT_MODEL_NAME=resnet50 -export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}_quant.tar.gz -wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT2_models/${QUANT_MODEL_ARCHIVE} +export QUANT_MODEL_NAME=ResNet50 +export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}_qat_model.tar.gz +wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT_models/${QUANT_MODEL_ARCHIVE} mkdir ${QUANT_MODEL_NAME} && tar -xvf ${QUANT_MODEL_ARCHIVE} -C ${QUANT_MODEL_NAME} ``` -To download other Quant models, set the `QUANT_MODEL_NAME` variable in the above commands to one of the values: `resnet101`, `mobilenetv1`, `mobilenetv2`, `vgg16`, `vgg19`. +To download other Quant models, set the `QUANT_MODEL_NAME` variable in the above commands to one of the values: `ResNet101`, `MobileNetV1`, `MobileNetV2`, `VGG16`, `VGG19`. 
+ +Moreover, there are other variations of these Quant models that use different methods to obtain scales during training, run these commands to download and extract Quant model: + +```bash +mkdir -p /PATH/TO/DOWNLOAD/MODEL/ +cd /PATH/TO/DOWNLOAD/MODEL/ +export QUANT_MODEL_NAME=ResNet50_qat_perf +export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}.tar.gz +wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT_models/${QUANT_MODEL_ARCHIVE} +mkdir ${QUANT_MODEL_NAME} && tar -xvf ${QUANT_MODEL_ARCHIVE} -C ${QUANT_MODEL_NAME} +``` + +To download other Quant models, set the `QUANT_MODEL_NAME` variable to on of the values: `ResNet50_qat_perf`, `ResNet50_qat_range`, `ResNet50_qat_channelwise`, `MobileNet_qat_perf`, where: +- `ResNet50_qat_perf`, `MobileNet_qat_perf` with input/output scales in `fake_quantize_moving_average_abs_max` operators, with weight scales in `fake_dequantize_max_abs` operators +- `ResNet50_qat_range`, with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, with weight scales in `fake_dequantize_max_abs` operators +- `ResNet50_qat_channelwise`, with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, with weight scales in `fake_channel_wise_dequantize_max_abs` operators Download clean FP32 model for accuracy comparison against the INT8 model: diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 52ab83698592a..1f928bfc8a689 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -289,10 +289,14 @@ def __init__(self, loader): # if user exit python program when dataloader is still # iterating, resource may no release safely, so we - # add __del__ function to to CleanupFuncRegistrar - # to make sure __del__ is always called when program + # add _shutdown_on_exit function to to CleanupFuncRegistrar + # to make sure _try_shutdown_all is always called when program # exit for resoure releasing safely - CleanupFuncRegistrar.register(self.__del__) + # worker join may hang for in _try_shutdown_all call in atexit + # for main process is in atexit state in some OS, so we add + # timeout=1 for shutdown function call in atexit, for shutdown + # function call in __del__, we keep it as it is + CleanupFuncRegistrar.register(self._shutdown_on_exit) def _init_workers(self): # multiprocess worker and indice queue list initial as empty @@ -363,7 +367,7 @@ def _shutdown_worker(self, worker_id): self._indices_queues[worker_id].put(None) self._worker_status[worker_id] = False - def _try_shutdown_all(self): + def _try_shutdown_all(self, timeout=None): if not self._shutdown: try: self._exit_thread_expectedly() @@ -376,11 +380,12 @@ def _try_shutdown_all(self): for i in range(self._num_workers): self._shutdown_worker(i) - for w in self._workers: - w.join() - for q in self._indices_queues: - q.cancel_join_thread() - q.close() + if not self._shutdown: + for w in self._workers: + w.join(timeout) + for q in self._indices_queues: + q.cancel_join_thread() + q.close() finally: core._erase_process_pids(id(self)) self._shutdown = True @@ -560,6 +565,9 @@ def _try_put_indices(self): def __del__(self): self._try_shutdown_all() + def _shutdown_on_exit(self): + self._try_shutdown_all(1) + def __next__(self): try: # _batches_outstanding here record the total batch data number diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 86c63ababbbfd..2b9d512856005 100644 
--- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -74,6 +74,7 @@ def __init__(self): self.dataset = core.Dataset("MultiSlotDataset") self.thread_num = 1 self.filelist = [] + self.use_ps_gpu = False def set_pipe_command(self, pipe_command): """ @@ -251,9 +252,11 @@ def set_use_var(self, var_list): slot_var.type = "float" elif var.dtype == core.VarDesc.VarType.INT64: slot_var.type = "uint64" + elif var.dtype == core.VarDesc.VarType.INT32: + slot_var.type = "uint32" else: raise ValueError( - "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + "Currently, fluid.dataset only supports dtype=float32, dtype=int32 and dtype=int64" ) def set_hdfs_config(self, fs_name, fs_ugi): @@ -300,6 +303,15 @@ def _prepare_to_run(self): self.dataset.set_data_feed_desc(self.desc()) self.dataset.create_readers() + def _set_use_ps_gpu(self, use_ps_gpu): + """ + set use_ps_gpu flag + + Args: + use_ps_gpu: bool + """ + self.use_ps_gpu = use_ps_gpu + def _finish_to_run(self): self.dataset.destroy_readers() @@ -391,7 +403,10 @@ def _prepare_to_run(self): ) def _dynamic_adjust_before_train(self, thread_num): if not self.is_user_set_queue_num: - self.dataset.dynamic_adjust_channel_num(thread_num, False) + if self.use_ps_gpu: + self.dataset.dynamic_adjust_channel_num(thread_num, True) + else: + self.dataset.dynamic_adjust_channel_num(thread_num, False) self.dataset.dynamic_adjust_readers_num(thread_num) @deprecated( @@ -400,7 +415,10 @@ def _dynamic_adjust_before_train(self, thread_num): ) def _dynamic_adjust_after_train(self): if not self.is_user_set_queue_num: - self.dataset.dynamic_adjust_channel_num(self.thread_num, False) + if self.use_ps_gpu: + self.dataset.dynamic_adjust_channel_num(self.thread_num, True) + else: + self.dataset.dynamic_adjust_channel_num(self.thread_num, False) self.dataset.dynamic_adjust_readers_num(self.thread_num) @deprecated( diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index be5d9ac58311b..c8e1370e44772 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -63,37 +63,52 @@ def program_desc_tracing_guard(enable): @signature_safe_contextmanager def param_guard(parameters): + from paddle.fluid.dygraph.dygraph_to_static.program_translator import in_declarative_mode # Note: parameters is a reference of self._parameters or self._buffers - if not framework.in_dygraph_mode() and parameters: + if in_declarative_mode() and not framework.in_dygraph_mode() and parameters: origin_parameters = parameters.copy() for name, var_base in parameters.items(): - if isinstance(var_base, core.VarBase): - # Convert ParamBase into Parameter with same attributes in dy2stat. - if isinstance(var_base, framework.ParamBase): - new_var = var_base._to_static_var(to_parameter=True) - else: - # Check whether has been created before. - if var_base.name in var_base.block.vars: - new_var = var_base.block.vars[var_base.name] - # Note(Aurelius84): Convert VarBase in self._buffers into Variabe with - # same attributes and set persistable=True to allow saving this var. - # Because users can create a VarBase in `__init__` like a - # `mask` Tensor or `hidden_0` in RNN layers, which is equivalent to a Parameter - # and necessary for inferring. It will be pruned if it's not necessary for inferring. - else: - # But if its shape is empty while created from `create_variable()`, we consider this buffer - # non-persistable. See case of `drop_state` in lstm api. 
- is_persistable = len(var_base.shape) > 0 - - new_var = var_base._to_static_var( - to_parameter=False, persistable=is_persistable) - parameters[name] = new_var + if isinstance(var_base, list): + new_var = [_convert_into_variable(var) for var in var_base] + else: + new_var = _convert_into_variable(var_base) + parameters[name] = new_var yield parameters.update(origin_parameters) else: yield +def _convert_into_variable(var_base): + """ + Convert Varbase into Variable. + """ + if isinstance(var_base, core.VarBase): + # Check whether has been created before. + new_var = var_base.block._find_var_recursive(var_base.name) + if new_var is not None: + assert isinstance(new_var, framework.Variable) + # Convert ParamBase into Parameter with same attributes in dy2stat. + elif isinstance(var_base, framework.ParamBase): + new_var = var_base._to_static_var(to_parameter=True) + else: + # Note(Aurelius84): Convert VarBase in self._buffers into Variable with + # same attributes and set persistable=True to allow saving this var. + # Because users can create a VarBase in `__init__` like a + # `mask` Tensor or `hidden_0` in RNN layers, which is equivalent to a Parameter + # and necessary for inferring. It will be pruned if it's not necessary for inferring. + + # But if its shape is empty while created from `create_variable()`, we consider this buffer + # non-persistable. See case of `drop_state` in lstm api. + is_persistable = len(var_base.shape) > 0 + + new_var = var_base._to_static_var( + to_parameter=False, persistable=is_persistable) + return new_var + else: + return var_base + + def enabled(): """ This function checks whether the program runs in dynamic graph mode or not. @@ -664,7 +679,7 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): if isinstance(framework._current_expected_place(), framework.core.CPUPlace): #TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. - # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. + # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html # (2): when used in flask framework, it may result in hang. # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index c7ea412fec1b7..2938516e5bc44 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -15,6 +15,7 @@ from collections import OrderedDict from ..framework import Parameter from .layers import Layer +from .base import param_guard __all__ = [ 'Sequential', @@ -159,7 +160,8 @@ def __init__(self, parameters=None): self.add_parameter(str(idx), param) def __getitem__(self, idx): - return self._parameters[str(idx)] + with param_guard(self._parameters): + return self._parameters[str(idx)] def __setitem__(self, idx, param): assert isinstance(param, Parameter) @@ -169,7 +171,8 @@ def __len__(self): return len(self._parameters) def __iter__(self): - return iter(self._parameters.values()) + with param_guard(self._parameters): + return iter(self._parameters.values()) def append(self, parameter): """Appends a given parameter at the end of the list. 
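For context on the `ParameterList` change above, here is a minimal usage sketch (hypothetical, not part of this patch, mirroring the new `test_param_guard.py` test added later in this diff) showing why `__getitem__` needs `param_guard`: under `@to_static`, each `ParamBase` fetched from the list must be converted into a static `Parameter` before it can feed an op.

```python
# Minimal sketch (assumed usage): indexing a ParameterList inside a
# @to_static forward now goes through the param_guard-wrapped __getitem__,
# so the ParamBase is converted into a static Parameter.
import paddle


class SimpleNet(paddle.nn.Layer):
    def __init__(self, in_size, out_size):
        super(SimpleNet, self).__init__()
        weight = self.create_parameter([in_size, out_size])
        self.params = paddle.nn.ParameterList([weight])

    @paddle.jit.to_static
    def forward(self, x):
        # self.params[0] triggers the param_guard-wrapped __getitem__
        return paddle.matmul(x, self.params[0])


net = SimpleNet(10, 3)
out = net(paddle.rand([4, 10], dtype='float32'))
```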
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 7604be2d838eb..a621f68c6545a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -26,6 +26,7 @@ import numpy import six +from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction @@ -40,6 +41,9 @@ BUILTIN_LIKELY_MODULES = [ collections, pdb, copy, inspect, re, six, numpy, logging ] +# The api(s) should be considered as plain function and convert +# them into static layer code. +PADDLE_NEED_CONVERT_APIS = [Sequential] translator_logger = TranslatorLogger() @@ -92,6 +96,10 @@ def is_unsupported(func): format(func)) return True + # NOTE: should be placed before `is_paddle_func` + if type(func) in PADDLE_NEED_CONVERT_APIS: + return False + if is_paddle_func(func): translator_logger.log( 2, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index 4b3b9fcf29885..cbe6b8a0ff942 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -368,5 +368,8 @@ def _get_node_var_type(self, cur_wrapper): if isinstance(node.func, gast.Name): return self.var_env.get_var_type(node.func.id) + if isinstance(node, gast.Subscript): + if self.is_tensor_node(node.value): + return {NodeVarType.TENSOR} return {NodeVarType.STATEMENT} diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 18dfff434a2aa..ecf6be1a0224a 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -34,7 +34,7 @@ from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope -from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_ from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph import no_grad import paddle.utils.deprecated as deprecated @@ -873,6 +873,10 @@ def _build_once(self, *args, **kwargs): pass def __call__(self, *inputs, **kwargs): + # NOTE(Aurelius84): Why we still need param_guard here? + # In case of ControlFlow, true_fn and false_fn will contain + # parameters that may not trigger logic of `Operator` to create + # them. we add this to make sure all parameters is available. 
with param_guard(self._parameters), param_guard(self._buffers): for forward_pre_hook in self._forward_pre_hooks.values(): hook_result = forward_pre_hook(self, inputs) @@ -1427,8 +1431,19 @@ def transform(t, device, dtype, blocking): dtype = t.dtype new_t = t._copy_to(device, blocking) - if dtype is not None and dtype != t.dtype: - new_t = new_t.cast(dtype=dtype) + if isinstance(t, framework.ParamBase): + if dtype is not None and dtype != t.dtype: + framework._dygraph_tracer().trace_op( + type='cast', + inputs={'X': new_t}, + outputs={'Out': new_t}, + attrs={ + 'in_dtype': t.dtype, + 'out_dtype': convert_np_dtype_to_dtype_(dtype) + }) + else: + if dtype is not None and dtype != t.dtype: + new_t = new_t.cast(dtype=dtype) return new_t diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ca5e5606e432b..2be062962ec9d 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -417,14 +417,15 @@ class DataParallel(layers.Layer): Note that setting the find_unused_parameters to True will affect computing performance. Therefore, if all parameters are sure to participate in the loss calculation and the - autograd graph construction, please set it False. Default: True. + autograd graph construction, please set it False. Default: False. Returns: Layer: The data paralleled module. Examples: .. code-block:: python - + + # required: distributed import paddle import paddle.nn as nn import paddle.optimizer as opt @@ -474,7 +475,7 @@ def __init__(self, strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=True): + find_unused_parameters=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") @@ -576,12 +577,8 @@ def _find_varbase(self, obj): def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: - if self.find_unused_parameters: - self._reducer.prepare_for_backward( - list(self._find_varbase(outputs))) - else: - self._reducer.prepare_for_backward(list(self._find_varbase([]))) - + self._reducer.prepare_for_backward( + list(self._find_varbase(outputs))) return outputs @deprecated( diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 37900b7880a35..644e25ab9183b 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -86,7 +86,7 @@ def _to_static_var(self, to_parameter=False, **kwargs): """ - # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. + # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery in dygraph only, should not let it getattr(self, attr, None). attr_not_need_keys = ['grad'] if isinstance(self, ParamBase): @@ -108,6 +108,8 @@ def _to_static_var(self, to_parameter=False, **kwargs): if to_parameter or isinstance(self, ParamBase): del attr_kwargs['persistable'] + # NOTE(Aurelius84): All parameters should be placed into global block. 
+ attr_kwargs['block'] = attr_kwargs['block'].program.global_block() static_var = Parameter(**attr_kwargs) else: static_var = Variable(**attr_kwargs) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 62a9c42ee0a61..620729795bc20 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1507,6 +1507,9 @@ def _run_from_dataset(self, trainer._gen_trainer_desc() self._dump_debug_info(program=program, trainer=trainer) + # in case of calling _set_use_ps_gpu explicitly + if dataset.use_ps_gpu is False: + dataset._set_use_ps_gpu(trainer.proto_desc.use_ps_gpu) dataset._dynamic_adjust_before_train(trainer.proto_desc.thread_num) trainer_instance = self._default_executor.init_for_dataset( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2eac5adcf226c..a858ba783428e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -39,6 +39,7 @@ import paddle.version as fluid_version import warnings import functools +from .variable_index import _getitem_impl_, _setitem_impl_ __all__ = [ 'Program', @@ -778,205 +779,6 @@ def __instancecheck__(cls, instance): return issubclass(t, Parameter) -def _getitem_impl_(var, item): - """ - Slice the variable. - - Args: - item(int/slice/tuple) : the index. - - Returns: - Sliced variable - """ - - if not isinstance(item, tuple): - item = [item] - - decrease_axis = [] - slice_axis = [] - slice_start = [] - slice_end = [] - slice_step = [] - use_strided_slice = False - reverse_axis = [] - target_block = default_main_program().current_block() - - def fill_constant(shape, value, force_cpu=False, out=None): - var.block.append_op( - type='fill_constant', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'shape': shape, - 'dtype': out.dtype, - 'value': float(value), - 'force_cpu': force_cpu - }) - out.stop_gradient = True - return out - - for dim, slice_item in enumerate(item): - if isinstance(slice_item, slice): - start = slice_item.start - end = slice_item.stop - step = slice_item.step - - if start is None and end is None and step is None: - continue - - if step is None: - step = 1 - - if start is None and end is None: - assert (step == -1) - reverse_axis.append(dim) - continue - - if start is None: - start = 0 - - if end is None: - end = 10000000 - - if step != 1: - use_strided_slice = True - - slice_axis.append(dim) - slice_start.append(start) - slice_end.append(end) - slice_step.append(step) - else: - decrease_axis.append(dim) - slice_axis.append(dim) - slice_start.append(slice_item) - slice_step.append(1) - if isinstance(slice_item, Variable): - temp_1 = var.block.create_var(dtype=slice_item.dtype) - fill_constant([1], 1, force_cpu=True, out=temp_1) - temp_end = target_block.create_var(dtype=slice_item.dtype) - target_block.append_op( - type='elementwise_add', - inputs={'X': slice_item, - 'Y': temp_1}, - outputs={'Out': temp_end}, - attrs={'axis': -1}) - slice_end.append(temp_end) - else: - slice_end.append(slice_item + 1 - if slice_item != -1 else 10000000) - - def contain_var(one_list): - for ele in one_list: - if isinstance(ele, Variable): - return True - return False - - def get_new_list_tensor(old_list): - new_list_tensor = [] - for dim in old_list: - if isinstance(dim, Variable): - dim.stop_gradient = True - new_list_tensor.append(dim) - else: - assert (isinstance(dim, int)) - temp_out = var.block.create_var(dtype='int64') - fill_constant([1], dim, force_cpu=True, out=temp_out) - new_list_tensor.append(temp_out) - return new_list_tensor - 
- inputs = {'Input': [var]} - attrs = { - 'axes': slice_axis, - 'starts': [], - 'ends': [], - 'decrease_axis': decrease_axis - } - if (use_strided_slice == True): - attrs['strides'] = [] - infer_flags = list(1 for i in range(len(slice_axis))) - - # starts - if contain_var(slice_start): - inputs['StartsTensorList'] = get_new_list_tensor(slice_start) - for i, dim in enumerate(slice_start): - if isinstance(dim, Variable): - attrs['starts'].append(-1) - infer_flags[i] = -1 - else: - attrs['starts'].append(dim) - else: - attrs['starts'] = slice_start - - # ends - if contain_var(slice_end): - inputs['EndsTensorList'] = get_new_list_tensor(slice_end) - for i, dim in enumerate(slice_end): - if isinstance(dim, Variable): - attrs['ends'].append(-1) - infer_flags[i] = -1 - else: - attrs['ends'].append(dim) - else: - attrs['ends'] = slice_end - - # strides - if use_strided_slice == True: - if contain_var(slice_step): - inputs['StridesTensorList'] = get_new_list_tensor(slice_step) - for i, dim in enumerate(slice_step): - if isinstance(dim, Variable): - attrs['strides'].append(-1) - infer_flags[i] = -1 - else: - attrs['strides'].append(dim) - else: - attrs['strides'] = slice_step - # infer_flags - attrs['infer_flags'] = infer_flags - - out = var - if use_strided_slice == False and len(slice_axis) > 0: - # append slice_op here - slice_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + "_slice"), - dtype=var.dtype) - - target_block.append_op( - type="slice", - inputs=inputs, - outputs={'Out': [slice_out_var]}, - attrs=attrs) - - out = slice_out_var - elif use_strided_slice == True and len(slice_axis) > 0: - strided_slice_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + - "_strided_slice"), - dtype=var.dtype) - target_block.append_op( - type="strided_slice", - inputs=inputs, - outputs={'Out': [strided_slice_out_var]}, - attrs=attrs) - - out = strided_slice_out_var - - if len(reverse_axis) > 0: - reverse_out_var = target_block.create_var( - name=unique_name.generate_with_ignorable_key(var.name + - "_slice_reverse"), - dtype=var.dtype) - target_block.append_op( - type="reverse", - inputs={'X': out}, - outputs={'Out': [reverse_out_var]}, - attrs={'axis': reverse_axis}) - - out = reverse_out_var - - return out - - @six.add_metaclass(VariableMetaClass) class Variable(object): """ @@ -1832,160 +1634,7 @@ def __getitem__(self, item): return _getitem_impl_(self, item) def __setitem__(self, item, value): - inputs = {'Input': self} - - # 1. Parse item - if not isinstance(item, tuple): - item = [item] - - decrease_axes = [] - axes = [] - starts = [] - ends = [] - steps = [] - - max_integer = sys.maxsize - - def replace_ellipsis(item): - # Use slice(None) to replace Ellipsis. - # For var, var.shape = [3,4,5,6] - # - # var[..., 1:2] -> var[:, :, :, 1:2] - # var[0, ...] 
-> var[0] - # var[0, ..., 1:2] -> var[0, :, :, 1:2] - - item = list(item) - - # Remove Variable to skip bug when counting Ellipsis - item_remove_var = [ - ele for ele in item if not isinstance(ele, Variable) - ] - ell_count = item_remove_var.count(Ellipsis) - if ell_count == 0: - return item - elif ell_count > 1: - raise IndexError( - "An index can only have a single ellipsis ('...')") - - ell_idx = item.index(Ellipsis) - - if ell_idx == len(item) - 1: - return item[:-1] - else: - item[ell_idx:ell_idx + 1] = [slice(None)] * ( - len(self.shape) - len(item) + 1) - - return item - - item = replace_ellipsis(item) - - for dim, slice_item in enumerate(item): - if isinstance(slice_item, slice): - start = slice_item.start - end = slice_item.stop - step = slice_item.step - - if start is None and end is None and step is None: - continue - - step = 1 if step is None else step - - # TODO: support cases when step < 1 - if not isinstance(step, Variable) and step == 0: - raise ValueError( - "When assign a value to a paddle.Tensor, step can not be 0, " - "but received step is {}.".format(step)) - - if isinstance(step, Variable) and (start is None or - end is None): - raise ValueError( - "When assign a value to a paddle.Tensor, it's not supported that " - "the start or end is None when the type of step is paddle.Tensor." - ) - - if start is None: - start = 0 if step > 0 else max_integer - - if end is None: - end = max_integer if step > 0 else (0 - max_integer) - else: - decrease_axes.append(dim) - start = slice_item - end = slice_item + 1 if slice_item != -1 else max_integer - step = 1 - - axes.append(dim) - starts.append(start) - ends.append(end) - steps.append(step) - - attrs = { - 'axes': axes, - 'starts': starts, - 'ends': ends, - 'steps': steps, - 'decrease_axes': decrease_axes - } - - from .layers import utils - if utils._contain_var(starts): - inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) - del attrs['starts'] - if utils._contain_var(ends): - inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) - del attrs['ends'] - if utils._contain_var(steps): - inputs['StepsTensorList'] = utils._convert_to_tensor_list(steps) - del attrs['steps'] - - # 2. Parse value - dtype = self.dtype - attrs['dtype'] = dtype - - from .data_feeder import convert_dtype - # 2.1 value is an integer of float - if isinstance(value, (int, float)): - value = np.array([value]).astype(convert_dtype(dtype)) - - # 2.2 value is a np.ndarray - if isinstance(value, np.ndarray): - shape = list(value.shape) - if dtype == core.VarDesc.VarType.BOOL: - value_name = "bool_values" - values = [bool(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.FP64: - value_name = "fp64_values" - values = [float(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.INT32: - value_name = "int32_values" - values = [int(v) for v in value.flat] - elif dtype == core.VarDesc.VarType.INT64: - value_name = "int64_values" - values = [int(v) for v in value.flat] - else: - raise TypeError( - "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " - "received %s." 
% convert_dtype(dtype)) - attrs[value_name] = values - attrs["shape"] = shape - - elif isinstance(value, Variable): - inputs["ValueTensor"] = value - else: - raise TypeError( - "Only support to assign an integer, float, numpy.ndarray or " - "paddle.Tensor to a paddle.Tensor, but received {}".format( - type(value))) - - cur_block = default_main_program().current_block() - cur_block.append_op( - type="set_value", inputs=inputs, outputs={'Out': self}, attrs=attrs) - - return self + return _setitem_impl_(self, item, value) def get_value(self, scope=None): """ @@ -3222,14 +2871,22 @@ def append_op(self, *args, **kwargs): if attrs else {}, kwargs.get("stop_gradient", False)) else: + from paddle.fluid.dygraph.base import param_guard + op_desc = self.desc.append_op() - op = Operator( - block=self, - desc=op_desc, - type=kwargs.get("type", None), - inputs=kwargs.get("inputs", None), - outputs=kwargs.get("outputs", None), - attrs=kwargs.get("attrs", None)) + # NOTE(Aurelius84): In case of @to_static, all VarBase(s) should + # be converted into Variable(s) with same name and block location. + # This is ONE and ONLY logic of type transformation of dy2static. + inputs = kwargs.get("inputs", None) + outputs = kwargs.get("outputs", None) + with param_guard(inputs), param_guard(outputs): + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=inputs, + outputs=outputs, + attrs=kwargs.get("attrs", None)) self.ops.append(op) @@ -5855,6 +5512,13 @@ def __deepcopy__(self, memo): new_param.copy_(self, True) return new_param + def _copy_to(self, device, blocking): + print("in ParamBase copy_to func") + state = copy.deepcopy(self.__dict__) + new_param = ParamBase(self.shape, self.dtype, **state) + core.varbase_copy(self, new_param, device, blocking) + return new_param + __repr__ = __str__ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 0853d05ef3bbe..6fdca1c77a13c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -123,7 +123,7 @@ def add_sparse_table(self, table_id, strategy): support_accessor_class = [ 'DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor', - 'DownpourUnitAccessor' + 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor' ] if strategy.get('sparse_accessor_class') is not None: accessor_class = strategy.get('sparse_accessor_class') @@ -254,7 +254,7 @@ def add_sparse_table(self, table_id, strategy): table2.param = 2 table2.converter = converter table2.deconverter = deconverter - elif accessor_class == 'DownpourUnitAccessor': + elif accessor_class == 'DownpourUnitAccessor' or accessor_class == 'DownpourDoubleUnitAccessor': self.add_sparse_table_common_config(table, strategy) self.add_sparse_optimizer(table.accessor.embed_sgd_param, strategy, "embed_") @@ -380,7 +380,7 @@ def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var, table.accessor.fea_dim = fea_dim def add_sparse_optimizer(self, sgd, strategy, prefix): - optimizer_name = strategy.get(prefix + "sparse_optimizer", "adam") + optimizer_name = strategy.get(prefix + "sparse_optimizer", "adagrad") sgd.name = optimizer_name if optimizer_name == "naive": sgd.naive.learning_rate = \ @@ -394,6 +394,19 @@ def add_sparse_optimizer(self, sgd, strategy, prefix): strategy.get(prefix + 'sparse_learning_rate', 0.05) sgd.adagrad.initial_range = 
\ strategy.get(prefix + 'sparse_initial_range', 1e-4) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 + sgd.adagrad.initial_g2sum = strategy.get( + prefix + 'sparse_initial_g2sum', 3) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.adagrad.weight_bounds.extend(bounds) + elif optimizer_name == "std_adagrad": + sgd.adagrad.learning_rate = \ + strategy.get(prefix + 'sparse_learning_rate', 0.05) + sgd.adagrad.initial_range = \ + strategy.get(prefix + 'sparse_initial_range', 1e-4) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( prefix + 'sparse_initial_g2sum', 3) bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index f83dfd6a4eb14..884afb97e8f75 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -489,6 +489,7 @@ def _minimize(self, # user do not have to set it in config_fleet if accessor == "DownpourFeatureValueAccessor" \ or accessor == "DownpourCtrAccessor" \ + or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": if st.get("sparse_embedx_dim") is not None \ and st["sparse_embedx_dim"] != emb_to_size[key] - 3: @@ -769,7 +770,7 @@ def _minimize(self, if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", - "DownpourUnitAccessor" + "DownpourUnitAccessor", "DownpourDoubleUnitAccessor" ]: opt_info["dump_slot"] = True elif server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 30baa2aa26cda..30a0b4053e6ff 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1788,7 +1788,7 @@ def get_tensor(var): @static_only -def save(program, model_path, protocol=2, **configs): +def save(program, model_path, protocol=4, **configs): """ :api_attr: Static Graph @@ -1802,7 +1802,7 @@ def save(program, model_path, protocol=2, **configs): program(Program) : The program to saved. model_path(str): the file prefix to save the program. The format is "dirname/file_prefix". If file_prefix is empty str. A exception will be raised protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. - Default: 2 + Default: 4 configs(dict, optional) : optional keyword arguments. Returns: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index aa021c463bf3d..f87485c6a8f22 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -14772,7 +14772,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): the size of the last shard will be less than the calculated `shard_size` Args: - input (Tensor): Input indices with data type int64. It's last dimension must be 1. + input (Tensor): Input indices with data type int64 or int32. It's last dimension must be 1. index_num (int): An integer defining the range of the index. nshards (int): The number of shards. shard_id (int): The index of the current shard. 
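The `shard_index` docstring above can be made concrete with a small pure-Python sketch of the mapping (an illustration only, independent of the actual kernel): each index is kept and rebased only if it falls inside the current shard, otherwise it is replaced by `ignore_value`.

```python
# Illustration of the shard_index mapping documented above (pure Python,
# no Paddle call): shard_size is the ceiling of index_num / nshards.
def shard_index_value(v, index_num, nshards, shard_id, ignore_value=-1):
    shard_size = (index_num + nshards - 1) // nshards
    if shard_id * shard_size <= v < (shard_id + 1) * shard_size:
        return v - shard_id * shard_size
    return ignore_value


# Matches the docstring example: labels [[16], [1]] with index_num=20,
# nshards=2, shard_id=0 map to [[-1], [1]].
assert shard_index_value(16, index_num=20, nshards=2, shard_id=0) == -1
assert shard_index_value(1, index_num=20, nshards=2, shard_id=0) == 1
```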
@@ -14793,7 +14793,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): print(shard_label) # [[-1], [1]] """ - check_variable_and_dtype(input, 'input', ['int64'], 'shard_index') + check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' helper = LayerHelper(op_type, **locals()) if shard_id < 0 or shard_id >= nshards: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c0c07f593a3ed..987918493d3b4 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -580,8 +580,12 @@ def assign(input, output=None): input = numpy.array([input]) elif isinstance(input, (list, tuple)): input = numpy.array(input) - - if isinstance(input, Variable): + # NOTE(Aurelius84): Why we judge core.VarBase? + # In case of @to_static, a VarBase can be as input of `assign`, + # but in_dygraph_mode()==False under @to_static, which means + # isinstance(VarBase, Variable) == False. It will cause return None + # after this api. + if isinstance(input, (Variable, core.VarBase)): check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', 'bool' ], 'assign', '(When the type of input in assign is Variable.)') diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 41b2843ea33e9..c0b93c83f78e1 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -125,6 +125,8 @@ def __init__(self, # to train. These variables are called accumulators. # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) + # global_accumulator dict, {accum_name : acc_variable, ...} + self._global_accumulators = {} self.helper = None self._opti_name_list = [] self._accumulators_holder = {} @@ -157,6 +159,8 @@ def state_dict(self): for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp + for k, v in self._global_accumulators.items(): + state_dict[v.name] = v # global step if use lr decay if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() @@ -236,36 +240,42 @@ def set_state_dict(self, state_dict): "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ", type(global_step)) + def _load_state_para(state_dict, param): + var = param.value() + tensor = var.get_tensor() + model_np = np.array(tensor) + load_para = state_dict[param.name] + if isinstance(load_para, Variable): + load_para_np = load_para.numpy() + elif isinstance(load_para, core.VarBase): + load_para_np = load_para.numpy() + elif isinstance(load_para, np.ndarray): + load_para_np = load_para + else: + raise RuntimeError("State dict type {} not supprt".format( + str(type(load_para)))) + + assert model_np.shape == load_para_np.shape, \ + "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( + item.name, model_np.shape, load_para_np.shape) + + assert model_np.dtype == load_para_np.dtype, \ + "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + item.name, model_np.dtype, load_para_np.dtype) + + tensor.set(load_para_np, framework._current_expected_place()) + self._accumulators_holder = state_dict for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): assert var_tmp.name in state_dict, \ "optimizer variable {} 
not found".format( var_tmp.name ) - var = var_tmp.value() - tensor = var.get_tensor() - model_np = np.array(tensor) - - load_para = state_dict[var_tmp.name] - - if isinstance(load_para, Variable): - load_para_np = load_para.numpy() - elif isinstance(load_para, core.VarBase): - load_para_np = load_para.numpy() - elif isinstance(load_para, np.ndarray): - load_para_np = load_para - else: - raise RuntimeError("State dict type {} not supprt".format( - str(type(load_para)))) - - assert model_np.shape == load_para_np.shape, \ - "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) - - assert model_np.dtype == load_para_np.dtype, \ - "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + _load_state_para(state_dict, var_tmp) - tensor.set(load_para_np, framework._current_expected_place()) + for k, v in self._global_accumulators.items(): + assert v.name in state_dict, \ + "optimizer variable {} not found".format( v.name ) + _load_state_para(state_dict, v) # [aliases] Compatible with old method names set_dict = set_state_dict @@ -589,6 +599,60 @@ def _add_accumulator(self, self._accumulators[name][param.name] = var return var + def _add_global_accumulator(self, + name, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None): + """Utility function to add a global accumulator for all parameters in the model + + Args: + block: the block in which the loss variable is present + name: name of the accumulator + dtype: data type of the accumulator variable + fill_value: value to initialize the accumulator variable + shape: the shape of the accumulator + type: the variable type of the accumulator + device: the target place of the accumulator + """ + if self._name is not None: + name = self._name + "_" + name + if (name in self._global_accumulators): + if framework.in_dygraph_mode(): + return self._global_accumulators[name] + raise Exception("Global accumulator {} already exists".format(name)) + if shape == None: + shape = [1] # most case, global accumulator is of shape [1] + assert isinstance(self.helper, LayerHelper) + + var_name = name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + + var = self.helper.create_global_variable( + name=var_name, + persistable=True, + dtype=dtype if dtype else self._dtype, + type=type, + shape=shape, + belong_to_optimizer=True) + if device is None: + device = 'cpu' + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) + + if framework.in_dygraph_mode(): + if len(self._accumulators_holder) > 0: + assert var_name in self._accumulators_holder, \ + "Optimizer set error, {} should in state dict".format( var_name ) + var.set_value(self._accumulators_holder[var_name]) + + self._global_accumulators[name] = var + return var + def _get_accumulator(self, name, param): """Utility function to fetch an accumulator for a parameter @@ -597,7 +661,7 @@ def _get_accumulator(self, name, param): param: parameter variable for which accumulator is to be fetched Returns: - accumulator variable for the parameter + accumulator variable """ if self._name is not None: name = self._name + "_" + name @@ -607,6 +671,21 @@ def _get_accumulator(self, name, param): format(name, param.name)) return self._accumulators[name][param.name] + def 
_get_global_accumulator(self, name): + """Utility function to fetch a global accumulator + + Args: + name: name of the accumulator + + Returns: + accumulator variable + """ + if self._name is not None: + name = self._name + "_" + name + if (name not in self._global_accumulators): + raise Exception("Global accumulator {} does not exist".format(name)) + return self._global_accumulators[name] + def _update_param_device_map(self, parameters_and_grads, target_block): for param_and_grad in parameters_and_grads: if param_and_grad[0].trainable is True: @@ -1915,6 +1994,8 @@ class AdamOptimizer(Optimizer): gradient in current mini-batch, so it will be much more faster. But this mode has different semantics with the original Adam algorithm and may lead to different result. The default value is False. + use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow + for whole model instead of creating beta_pow for each parameter. Default is false. Examples: .. code-block:: python @@ -2024,7 +2105,8 @@ def __init__(self, regularization=None, grad_clip=None, name=None, - lazy_mode=False): + lazy_mode=False, + use_global_beta_pow=False): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -2040,6 +2122,7 @@ def __init__(self, self._beta2 = beta2 self._epsilon = epsilon self._lazy_mode = lazy_mode + self._use_global_beta_pow = use_global_beta_pow def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -2048,16 +2131,30 @@ def _create_accumulators(self, block, parameters): for p in parameters: self._add_accumulator(self._moment1_acc_str, p) self._add_accumulator(self._moment2_acc_str, p) - self._add_accumulator( + if not self._use_global_beta_pow: + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + if self._use_global_beta_pow: + self._add_global_accumulator( name=self._beta1_pow_acc_str, - param=p, fill_value=0.9 if isinstance(self._beta1, Variable) \ else self._beta1, shape=[1], type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') - self._add_accumulator( + self._add_global_accumulator( name=self._beta2_pow_acc_str, - param=p, fill_value=0.999 if isinstance(self._beta2, Variable) \ else self._beta2, shape=[1], @@ -2070,10 +2167,16 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) + if self._use_global_beta_pow: + beta1_pow_acc = self._get_global_accumulator( + self._beta1_pow_acc_str) + beta2_pow_acc = self._get_global_accumulator( + self._beta2_pow_acc_str) + else: + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) lr = self._create_param_lr(param_and_grad) # create the adam optimize op @@ -2087,7 +2190,8 @@ def _append_optimize_op(self, block, param_and_grad): beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, moment2, 
beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) + 1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow', + self._use_global_beta_pow) return None @@ -2109,7 +2213,8 @@ def _append_optimize_op(self, block, param_and_grad): } attrs = { "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000 + "min_row_size_to_use_multithread": 1000, + 'use_global_beta_pow': self._use_global_beta_pow } if isinstance(self._beta1, Variable): @@ -2134,6 +2239,43 @@ def _append_optimize_op(self, block, param_and_grad): return adam_op + def _finish_update(self, block, parameters_and_grads): + r"""Update beta1_pow and beta2_pow accumulator + """ + assert isinstance(block, framework.Block) + if self._use_global_beta_pow: + beta1_pow_acc = self._get_global_accumulator( + self._beta1_pow_acc_str) + beta2_pow_acc = self._get_global_accumulator( + self._beta2_pow_acc_str) + + with block.program._optimized_guard([]): + inputs = {"X": beta1_pow_acc} + attrs = {} + if isinstance(self._beta1, Variable): + inputs['ScaleTensor'] = self._beta1 + else: + attrs['scale'] = self._beta1 + block.append_op( + type="scale", + inputs=inputs, + outputs={"Out": beta1_pow_acc}, + attrs=attrs, + stop_gradient=True) + + inputs = {"X": beta2_pow_acc} + attrs = {} + if isinstance(self._beta2, Variable): + inputs['ScaleTensor'] = self._beta2 + else: + attrs['scale'] = self._beta2 + block.append_op( + type="scale", + inputs=inputs, + outputs={"Out": beta2_pow_acc}, + attrs=attrs, + stop_gradient=True) + class AdamaxOptimizer(Optimizer): r""" @@ -4116,7 +4258,7 @@ def _get_op_device_attr(self, op): device = op.attr(self._op_device_key) \ if op.has_attr(self._op_device_key) else None if device: - assert device[0:3] == 'gpu' or dev_type == 'npu', "Now, only gpu and npu devices are " \ + assert device[0:3] == 'gpu' or device[0:3] == 'npu', "Now, only gpu and npu devices are " \ "supported in pipeline parallemism." 
return device @@ -4200,6 +4342,8 @@ def _add_op_device_attr_for_op(self, op, idx, block): op.type == 'elementwise_div'): device = "gpu:all" op._set_attr(self._op_device_key, device) + elif op.type == "alloc_float_status": + op._set_attr(self._op_device_key, "gpu:all") else: other_known_ops = [ 'update_loss_scaling', @@ -4207,6 +4351,7 @@ def _add_op_device_attr_for_op(self, op, idx, block): 'concat', 'sum', 'check_finite_and_unscale', + 'alloc_float_status', ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \ @@ -4272,8 +4417,9 @@ def _check_validation(self, block): "{} has not been set.".format(op.type)) if device == "gpu:all": continue dev_type = device.split(':')[0] - assert dev_type == "gpu", ("Now only gpu devices are supported " - "for pipeline parallelism.") + assert dev_type == "gpu" or dev_type == 'npu', ( + "Now only gpu and npu devices are supported " + "for pipeline parallelism.") if not device in device_list: device_list.append(device) return device_list diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8e998459cd499..c4a256f0e193d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -17,12 +17,14 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) +list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_layer) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
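As a rough illustration of the relaxed device validation in `optimizer.py` above (a sketch only, not the patched code itself): besides `gpu`, `npu` device prefixes are now accepted for pipeline parallelism, and `gpu:all` remains a wildcard.

```python
# Sketch of the device check behaviour after this change; validate_op_device
# is a hypothetical stand-in for the checks in _check_validation above.
def validate_op_device(device):
    if device == "gpu:all":
        return device
    dev_type = device.split(':')[0]
    assert dev_type in ("gpu", "npu"), (
        "Now only gpu and npu devices are supported for pipeline parallelism.")
    return device


validate_op_device("npu:0")  # accepted after this change
validate_op_device("gpu:2")  # accepted as before
```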
@@ -53,6 +55,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2) list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3) list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) @@ -177,7 +180,8 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_layer) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) @@ -556,13 +560,13 @@ if(WITH_DISTRIBUTE) set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") - MATH(EXPR dist_ut_port "${dist_ut_port}+40") + MATH(EXPR dist_ut_port "${dist_ut_port}+35") if(dist_ut_port GREATER_EQUAL 22998) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) # solve it later. - # bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() @@ -864,7 +868,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_pipeline_layer PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py new file mode 100644 index 0000000000000..647c9e9672cf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import numpy as np + + +class BufferLayers(paddle.nn.Layer): + def __init__(self, out_channel): + super(BufferLayers, self).__init__() + self.out_channel = out_channel + + def forward(self, x): + mean = paddle.mean(x) + if mean < 0.: + x = x * self._mask() + + out = x - mean + return out + + def _mask(self): + return paddle.to_tensor(np.zeros([self.out_channel], 'float32')) + + +class SequentialNet(paddle.nn.Layer): + def __init__(self, sub_layer, in_channel, out_channel): + super(SequentialNet, self).__init__() + self.layer = paddle.nn.Sequential( + ('l1', paddle.nn.Linear(in_channel, in_channel)), + ('l2', paddle.nn.Linear(in_channel, out_channel)), + ('l3', sub_layer(out_channel))) + + def forward(self, x): + out = self.layer(x) + return out + + +class TestSequential(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + self.seed = 2021 + + def _init_seed(self): + paddle.seed(self.seed) + np.random.seed(self.seed) + + def _run(self, to_static): + self._init_seed() + net = SequentialNet(BufferLayers, 10, 3) + if to_static: + net = paddle.jit.to_static(net) + x = paddle.rand([16, 10], 'float32') + out = net(x) + if to_static: + load_out = self._test_load(net, x) + self.assertTrue( + np.allclose(load_out, out), + msg='load_out is {}\st_out is {}'.format(load_out, out)) + + return out + + def test_train(self): + paddle.jit.set_code_level(100) + dy_out = self._run(to_static=False) + st_out = self._run(to_static=True) + self.assertTrue( + np.allclose(dy_out, st_out), + msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out)) + + def _test_load(self, net, x): + model_path = './sequential_net' + paddle.jit.save(net, model_path) + load_net = paddle.jit.load(model_path) + out = load_net(x) + return out + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 0243ef3a6ddae..8da4e200cfc36 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -16,6 +16,7 @@ import unittest +import paddle import numpy as np import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative @@ -61,6 +62,30 @@ def test_list_append_in_for_loop(x, iter_num): return a[0] +def test_list_append_in_for_subscript(x): + x = fluid.dygraph.to_variable(x) + iter_num = paddle.shape(x)[0] + a = [] + for i in range(iter_num): + x = x + 1 + a.append(x) + out = paddle.concat(a) + return out[0] + + +def test_list_append_in_while_loop_subscript(x): + x = fluid.dygraph.to_variable(x) + iter_num = paddle.shape(x)[0] + a = [] + i = 0 + while i < iter_num: + x = x + 1 + a.append(x) + i += 1 + out = paddle.concat(a) + return out[0] + + def test_list_append_in_for_loop_with_concat(x, iter_num): x = fluid.dygraph.to_variable(x) a = [] @@ -261,5 +286,16 @@ def init_dygraph_func(self): self.all_dygraph_funcs = [test_list_append_in_for_loop_with_concat, ] +class TestListInForLoopWithSubscript(TestListWithoutControlFlow): + 
def init_dygraph_func(self): + self.all_dygraph_funcs = [ + test_list_append_in_for_subscript, + test_list_append_in_while_loop_subscript + ] + + def init_data(self): + self.input = np.random.random((3, 4)).astype('float32') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py new file mode 100644 index 0000000000000..cd3c76412feac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py @@ -0,0 +1,171 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import unittest + +from paddle.jit import to_static, ProgramTranslator + + +class NetWithParameterList(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(NetWithParameterList, self).__init__() + weight = self.create_parameter([in_size, out_size]) + bias = self.create_parameter([out_size], is_bias=True) + self.params = paddle.nn.ParameterList([weight, bias]) + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.params[1]) + out = paddle.tanh(out) + return out + + +class NetWithParameterListIter(NetWithParameterList): + def __init__(self, in_size, out_size): + super(NetWithParameterListIter, self).__init__(in_size, out_size) + + @to_static + def forward(self, x): + # NOTE: manually trigger `__iter__` logic. 
+ params = list(self.params.__iter__()) + out = paddle.matmul(x, params[0]) + out = paddle.add(out, params[1]) + out = paddle.tanh(out) + return out + + +class TestParameterList(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter_num = 5 + self.prog_trans = ProgramTranslator() + + def train(self, is_iter, to_static): + paddle.seed(self.seed) + np.random.seed(self.seed) + self.prog_trans.enable(to_static) + if is_iter: + net = NetWithParameterList(10, 3) + else: + net = NetWithParameterListIter(10, 3) + sgd = paddle.optimizer.SGD(0.1, parameters=net.parameters()) + + for batch_id in range(self.iter_num): + x = paddle.rand([4, 10], dtype='float32') + out = net(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + return loss + + def test_parameter_list(self): + static_loss = self.train(False, to_static=True) + dygraph_loss = self.train(False, to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + def test_parameter_list_iter(self): + static_loss = self.train(True, to_static=True) + dygraph_loss = self.train(True, to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + +class NetWithRawParamList(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(NetWithRawParamList, self).__init__() + weight = self.add_parameter('w', + self.create_parameter([in_size, out_size])) + bias = self.add_parameter( + 'b', self.create_parameter( + [out_size], is_bias=True)) + self.params = [weight] + self.bias_dict = {'b': bias} + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.bias_dict['b']) + out = paddle.tanh(out) + return out + + +class TestRawParameterList(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter_num = 5 + self.prog_trans = ProgramTranslator() + + def init_net(self): + self.net = NetWithRawParamList(10, 3) + + def train(self, to_static): + paddle.seed(self.seed) + np.random.seed(self.seed) + self.prog_trans.enable(to_static) + self.init_net() + + sgd = paddle.optimizer.SGD(0.1, parameters=self.net.parameters()) + + for batch_id in range(self.iter_num): + x = paddle.rand([4, 10], dtype='float32') + out = self.net(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + + return loss + + def test_parameter_list(self): + static_loss = self.train(to_static=True) + dygraph_loss = self.train(to_static=False) + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss, + static_loss)) + + +class NetWithSubLayerParamList(paddle.nn.Layer): + def __init__(self, sub_layer): + super(NetWithSubLayerParamList, self).__init__() + self.sub_layer = sub_layer + self.params = [sub_layer.weight] + self.bias_dict = {'b': sub_layer.bias} + + @to_static + def forward(self, x): + out = paddle.matmul(x, self.params[0]) + out = paddle.add(out, self.bias_dict['b']) + out = paddle.tanh(out) + return out + + +class TestSubLayerParameterList(TestRawParameterList): + def init_net(self): + fc = paddle.nn.Linear(10, 3) + self.net = NetWithSubLayerParamList(fc) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py index 
dfbef998a2f07..349d5f82dbf54 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py @@ -231,7 +231,7 @@ def test_parallel_embedding(self): # model_b check_group = dist.new_group(list(range(self.model_parallel_size))) integral_w = [] - partial_w = model_a.embedding.embedding.weight.clone().detach() + partial_w = model_a.embedding.weight.clone().detach() paddle.distributed.all_gather(integral_w, partial_w, group=check_group) result_w = [] for idx in range(len(integral_w)): diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py index 767bf5d57e74a..a9f251f3079ce 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py @@ -37,6 +37,7 @@ def set_random_seed(seed, dp_id, rank_id): inner_size = 8 output_size = 2 seq_length = 2 +batch_size = 4 class SimpleMPNet(fluid.dygraph.Layer): @@ -130,18 +131,6 @@ def forward(self, x): return x -class TrainDataset(Dataset): - def __init__(self, length): - self.length = length - - def __len__(self): - return self.length - - def __getitem__(self, index): - np_input_data = np.random.randint(0, vocab_size, (seq_length, )) - return np_input_data - - class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() @@ -178,20 +167,6 @@ def build_model_optimizer(self): np_fc1 = np.random.random_sample((hidden_size, inner_size)) np_fc2 = np.random.random_sample((inner_size, hidden_size)) - train_data = TrainDataset(length=10000) - - train_batch_sampler = paddle.io.DistributedBatchSampler( - train_data, - batch_size=4, - shuffle=False, - num_replicas=self.data_parallel_size, - rank=dp_id) - train_data_loader = DataLoader( - dataset=train_data, - batch_sampler=train_batch_sampler, - num_workers=0, - return_list=True) - model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2, mp_id) optimizer_a = self.build_optimizer(model_a) @@ -202,16 +177,17 @@ def build_model_optimizer(self): np_fc1, np_fc2) optimizer_b = self.build_optimizer(model_b) - return model_a, optimizer_a, model_b, optimizer_b, train_data_loader + return model_a, optimizer_a, model_b, optimizer_b def test_mp_model(self): - model_a, optimizer_a, model_b, optimizer_b, train_data_loader = self.build_model_optimizer( + model_a, optimizer_a, model_b, optimizer_b = self.build_model_optimizer( ) - for step, batch in enumerate(train_data_loader): - if step > 5: - return - + for _ in range(5): + np_data = np.random.randint(0, vocab_size, ( + batch_size, + seq_length, )) + batch = paddle.to_tensor(np_data) loss_a = self.train_batch(batch, model_a, optimizer_a, True) loss_b = self.train_batch(batch, model_b, optimizer_b, False) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py new file mode 100644 index 0000000000000..14d7e960f4a68 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_layer import AlexNetPipeDesc, AlexNet + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 4 +micro_batch_size = 2 + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = AlexNet(10) + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + param_len = len(model_a.parameters()) + + parameters = [] + for param in model_a.parameters(): + parameters.append(param.numpy()) + + # construct model b + model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + + for idx, param in enumerate(model_b.parameters()): + param.set_value(parameters[idx + pp_id * (param_len // 2)]) + + # construct reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True) + + for step_id, data in enumerate(train_reader()): + x_data = np.array([x[0] for x in data]).astype('float32').reshape( + batch_size, 1, 28, 28) + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + img.stop_gradient = True + label.stop_gradient = True + + if step_id >= 5: + return True + + loss_a = model_a(img, label) + loss_a.backward() + optimizer_a.step() + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch([img, label], optimizer_b, scheduler_b) + + print("loss: ", loss_a.numpy(), loss_b.numpy()) + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py new file mode 100644 index 0000000000000..d2be0cb80722b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py @@ -0,0 +1,208 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddle.fluid.dygraph.container import Sequential +from paddle.distributed.fleet.meta_parallel import PipelineLayer +from paddle.fluid.dygraph.layers import Layer +import paddle.nn as nn +import paddle.fluid as fluid + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 16 +micro_batch_size = 4 +vocab_size = 128 +hidden_size = 8 + + +class SimpleNet(Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + + self.softmax_weight = self.create_parameter( + shape=[hidden_size, vocab_size]) + self.softmax_bias = self.create_parameter( + shape=[vocab_size], is_bias=False) + + def forward(self, x1, x2, y1): + x_emb = self.word_embeddings(x1) + fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = fluid.layers.elementwise_add(fc, self.softmax_bias) + projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=y1, soft_label=False) + return loss.mean() + + +class EmbeddingNet(Layer): + def __init__(self): + super(EmbeddingNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + + def forward(self, args): + x1, x2 = args + x_emb = self.word_embeddings(x1) + return x_emb, x2 + + +class MatmulNet(Layer): + def __init__(self): + super(MatmulNet, self).__init__() + self.softmax_weight = self.create_parameter( + shape=[hidden_size, vocab_size]) + + def forward(self, args): + x1, x2 = args + fc = fluid.layers.matmul(x1, self.softmax_weight) + + return fc, x2 + + +class BiasNet(Layer): + def __init__(self): + super(BiasNet, self).__init__() + self.softmax_bias = self.create_parameter(shape=[vocab_size]) + + def forward(self, args): + fc, x2 = args + fc = fluid.layers.elementwise_add(fc, self.softmax_bias) + projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + return projection, x2 + + +class LossNet(Layer): + def __init__(self): + super(LossNet, self).__init__() + + def forward(self, args, y1): + projection, x2 = args + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=y1[0], soft_label=False) + return loss.mean() + + +class SimpleNetPipe(Layer): + def __init__(self): + super(SimpleNetPipe, self).__init__() + 
self.features = Sequential(EmbeddingNet(), MatmulNet(), BiasNet()) + + def to_layers(self): + feat = [self.features[i] for i in range(len(self.features))] + return feat + + +class TestDistEmbeddingTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = SimpleNet() + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + init_net = SimpleNetPipe() + model_b = PipelineLayer( + layers=init_net.to_layers(), + num_stages=self.pipeline_parallel_size, + loss_fn=LossNet()) + + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + + param_len = len(model_a.parameters()) + + parameters = [] + for param in model_a.parameters(): + print(param.name, param.shape) + parameters.append(param.numpy()) + + model_b_params = model_b.parameters() + if pp_id == 0: + model_b_params[0].set_value(parameters[2]) + else: + model_b_params[0].set_value(parameters[0]) + model_b_params[1].set_value(parameters[1]) + + for step in range(5): + x1_data = np.random.randint(0, vocab_size, size=[batch_size, 1]) + x2_data = np.random.randint(0, vocab_size, size=[batch_size, 1]) + y1_data = np.random.randint(0, 10, size=[batch_size, 1]) + + x1 = paddle.to_tensor(x1_data) + x2 = paddle.to_tensor(x2_data) + y1 = paddle.to_tensor(y1_data) + + x1.stop_gradient = True + x2.stop_gradient = True + y1.stop_gradient = True + + loss_a = model_a(x1, x2, y1) + loss_a.backward() + optimizer_a.step() + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch([(x1, x2), (y1, )], optimizer_b, + scheduler_b) + + print("loss", loss_a.numpy(), loss_b.numpy()) + np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py index 3130cbf458467..b30df0e9a2f21 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py @@ -12,17 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import unittest import numpy as np import os import paddle from paddle.distributed import fleet -import copy from paddle.fluid.dygraph.container import Sequential import paddle.nn as nn from paddle.fluid.dygraph.layers import Layer from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer import paddle.nn.functional as F -import unittest + + +class ReshapeHelp(Layer): + def __init__(self, shape): + super(ReshapeHelp, self).__init__() + self.shape = shape + + def forward(self, x): + return x.reshape(shape=self.shape) class AlexNet(Layer): @@ -30,7 +38,7 @@ def __init__(self, num_classes=10): super(AlexNet, self).__init__() self.features = Sequential( nn.Conv2D( - 3, 64, kernel_size=11, stride=4, padding=5), + 1, 64, kernel_size=11, stride=4, padding=5), nn.ReLU(), nn.MaxPool2D( kernel_size=2, stride=2), @@ -50,13 +58,14 @@ def __init__(self, num_classes=10): nn.ReLU(), nn.MaxPool2D( kernel_size=2, stride=2), ) + + self.reshape_layer = ReshapeHelp(shape=[-1, 256]) self.classifier = nn.Linear(256, num_classes) self.loss_fn = nn.loss.CrossEntropyLoss() def forward(self, x, y): x = self.features(x) - x.flatten() - + x = self.reshape_layer(x) x = self.classifier(x) return self.loss_fn(x, y) @@ -64,7 +73,7 @@ def forward(self, x, y): class AlexNetPipe(AlexNet): def to_layers(self): feat = [self.features[i] for i in range(len(self.features))] - loss_fn = [lambda x: x.flatten(), self.classifier] + loss_fn = [self.reshape_layer, self.classifier] feat.extend(loss_fn) return feat @@ -74,7 +83,7 @@ def __init__(self, num_classes=10, **kwargs): self.num_classes = num_classes decs = [ LayerDesc( - nn.Conv2D, 3, 64, kernel_size=11, stride=4, padding=5), + nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5), LayerDesc(nn.ReLU), LayerDesc( nn.MaxPool2D, kernel_size=2, stride=2), @@ -94,7 +103,8 @@ def __init__(self, num_classes=10, **kwargs): F.relu, LayerDesc( nn.MaxPool2D, kernel_size=2, stride=2), - lambda x: x.flatten(), + LayerDesc( + ReshapeHelp, shape=[-1, 256]), LayerDesc(nn.Linear, 256, self.num_classes), # classifier ] super(AlexNetPipeDesc, self).__init__( @@ -104,24 +114,24 @@ def __init__(self, num_classes=10, **kwargs): class TestPipeLayerAPI(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() - self.model_parallel_size = 2 + self.pipeline_parallel_size = 2 strategy.hybrid_configs = { "dp_degree": 1, "mp_degree": 1, - "pp_degree": self.model_parallel_size + "pp_degree": self.pipeline_parallel_size } fleet.init(is_collective=True, strategy=strategy) self.hcg = fleet.get_hybrid_communicate_group() def test_pipelayer_desc(self): - pipe_model = AlexNetPipeDesc(num_stages=self.model_parallel_size) + pipe_model = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) np.testing.assert_array_equal(len(pipe_model.parameters()), 6) def test_pipelayer_sequential(self): init_net = AlexNetPipe() pipe_model = PipelineLayer( layers=init_net.to_layers(), - num_stages=self.model_parallel_size, + num_stages=self.pipeline_parallel_size, loss_fn=nn.CrossEntropyLoss()) stage_id = self.hcg.get_stage_id() init_parameters = init_net.parameters() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py deleted file mode 100644 index 9b9283a1a9b6e..0000000000000 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import random -import paddle.distributed as dist -import paddle.fluid as fluid -import paddle.distributed.fleet as fleet -from paddle.io import DataLoader, Dataset -import unittest - - -def set_random_seed(seed, dp_id, rank_id): - """Set random seed for reproducability.""" - random.seed(seed) - np.random.seed(seed + dp_id) - paddle.seed(seed + rank_id) - - -HIDDEN_DIM = 32 -LAYERS = 8 - - -def sequential_model(): - model = paddle.nn.Sequential( - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), - paddle.nn.Linear(HIDDEN_DIM, 1), ) - return model - - -class TestDistPPTraning(unittest.TestCase): - def setUp(self): - strategy = fleet.DistributedStrategy() - self.model_parallel_size = 1 - self.data_parallel_size = 1 - self.pipeline_parallel_size = 2 - strategy.hybrid_configs = { - "dp_degree": self.data_parallel_size, - "mp_degree": self.model_parallel_size, - "pp_degree": self.pipeline_parallel_size, - } - strategy.pipeline_configs = {"accumulate_steps": 2} - paddle.distributed.init_parallel_env() - fleet.init(is_collective=True, strategy=strategy) - - def test_mp_model(self): - batch_input = paddle.randn(shape=(1, HIDDEN_DIM), dtype="float32") - pipe_model = sequential_model() - sgd = paddle.optimizer.SGD(learning_rate=0.0003, parameters=[]) - pipe_model = paddle.distributed.fleet.distributed_model(pipe_model) - - if pipe_model.stage_id == 0 or pipe_model.stage_id == 1: - pipe_input = batch_input.clone().detach() - pipe_input = paddle.cast(pipe_input, 'float32') - - def data_gen(): - gen = True - while gen: - yield [pipe_input, 0] - gen = False - - loader = paddle.io.DataLoader.from_generator(capacity=5) - loader.set_batch_generator(data_gen) - data_iter = iter(loader) - else: - data_iter = None - return True - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py index 3daa50020bab2..cde2fa412d705 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py @@ -31,10 +31,7 @@ def setUp(self): size=128, num_flatten_dims=1, act="relu") - fc_out2 = fluid.layers.fc(input=fc_out1, - size=32, - num_flatten_dims=1) - out = fluid.layers.softmax(input=fc_out2) + out = fluid.layers.softmax(input=fc_out1) self.feeds = { "data": np.random.random((32, 128, 2, 2)).astype("float32") @@ -55,6 +52,60 @@ def test_check_output(self): self.check_output_with_option(use_gpu[i]) +class 
FCFusePassTRTStaticDims4Cols1Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[32, 128, 32, 8], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=1, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = { + "data": np.random.random((32, 128, 32, 8)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = FCFusePassTRTStaticDims4Cols1Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[3, 24, 16, 16], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=32, + num_flatten_dims=2, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = { + "data": np.random.random((3, 24, 16, 16)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = FCFusePassTRTStaticDims4Cols2Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + class FCFusePassTRTDynamicDims2Test(InferencePassTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index bdcdeee8dcb66..d895ac44d8931 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -262,7 +262,6 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( name="data", shape=[-1, 3, 64, 64], dtype="float32") - fc_out = fluid.layers.fc(input=data, size=200) param_attr = fluid.ParamAttr( name='instance_norm_w', initializer=fluid.initializer.Constant(value=1.0)) @@ -270,7 +269,7 @@ def setUp(self): name='instance_norm_b', initializer=fluid.initializer.Constant(value=0.0)) out = fluid.layers.instance_norm( - input=fc_out, param_attr=param_attr, bias_attr=bias_attr) + input=data, param_attr=param_attr, bias_attr=bias_attr) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py index ba7c8abc56daa..088b4fb59057b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py @@ -63,4 +63,6 @@ def init_test_case(self): if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py index 149002fc76508..dba63be27b438 100644 --- 
a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py @@ -26,22 +26,23 @@ "place does not support BF16 evaluation") class TestMatmulBf16MklDNNOp(OpTest): def generate_data(self): - self.x = np.random.random((25, 2, 2)).astype(np.float32) - self.y = np.random.random((25, 2, 2)).astype(np.float32) - self.alpha = 1.0 - self.out = self.alpha * np.matmul(self.x, self.y) + self.x_fp32 = np.random.random((25, 2, 2)).astype(np.float32) + self.y_fp32 = np.random.random((25, 2, 2)).astype(np.float32) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) def set_attributes(self): - self.alpha = self.alpha if hasattr(self, 'alpha') else 1.0 self.attrs = { 'alpha': self.alpha, "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - "force_fp32_output": self.force_fp32_output + "force_fp32_output": self.force_fp32_output, + 'transpose_X': False, + 'transpose_Y': False } def setUp(self): self.op_type = "matmul" + self.alpha = 1.0 self.use_mkldnn = True self.dtype = np.uint16 self.mkldnn_data_type = "bfloat16" @@ -53,67 +54,113 @@ def setUp(self): self.out = convert_float_to_uint16(self.out) self.outputs = {'Out': self.out} - self.x = convert_float_to_uint16(self.x) - self.y = convert_float_to_uint16(self.y) - self.inputs = {'X': self.x, 'Y': self.y} + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.y_bf16 = convert_float_to_uint16(self.y_fp32) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} def test_check_output(self): self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + check_dygraph=False, + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def matmul_grad(self, x, transpose_x, y, transpose_y): + x_transpose_axes = [1, 0] if x.ndim == 2 else [0, 2, 1] + y_transpose_axes = [1, 0] if y.ndim == 2 else [0, 2, 1] + + x = np.transpose(x, x_transpose_axes) if transpose_x else x + y = np.transpose(y, y_transpose_axes) if transpose_y else y + + return self.alpha * np.matmul(x, y) + + def calculate_grads(self): + x_transpose_axes = [1, 0] if self.x_fp32.ndim == 2 else [0, 2, 1] + y_transpose_axes = [1, 0] if self.y_fp32.ndim == 2 else [0, 2, 1] + + x = np.transpose(self.x_fp32, x_transpose_axes) if self.attrs[ + 'transpose_X'] is True else self.x_fp32 + y = np.transpose(self.y_fp32, y_transpose_axes) if self.attrs[ + 'transpose_Y'] is True else self.y_fp32 + + dout = self.alpha * np.matmul(x, y) + + if self.attrs['transpose_X'] is True and self.attrs[ + 'transpose_Y'] is True: + self.dx = self.matmul_grad(self.y_fp32, True, dout, True) + self.dy = self.matmul_grad(dout, True, self.x_fp32, True) + elif self.attrs['transpose_X'] is True and self.attrs[ + 'transpose_Y'] is False: + self.dx = self.matmul_grad(self.y_fp32, False, dout, True) + self.dy = self.matmul_grad(self.x_fp32, False, dout, False) + elif self.attrs['transpose_X'] is False and self.attrs[ + 'transpose_Y'] is True: + self.dx = self.matmul_grad(dout, False, self.y_fp32, False) + self.dy = self.matmul_grad(dout, True, self.x_fp32, False) + else: + self.dx = self.matmul_grad(dout, False, self.y_fp32, True) + self.dy = self.matmul_grad(self.x_fp32, True, dout, False) + + self.dout = dout class TestDnnlMatMulOpAlpha(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((17, 2, 3)).astype(np.float32) - self.y 
= np.random.random((17, 3, 2)).astype(np.float32) + self.x_fp32 = np.random.random((17, 2, 3)).astype(np.float32) + self.y_fp32 = np.random.random((17, 3, 2)).astype(np.float32) self.alpha = 2.0 - self.out = self.alpha * np.matmul(self.x, self.y) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) class TestDnnlMatMulOp2D(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((9, 12)).astype(np.float32) - self.out = np.matmul(self.x, self.y) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((9, 12)).astype(np.float32) + self.out = np.matmul(self.x_fp32, self.y_fp32) class TestDnnlMatMulOpTransposeX(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((12, 9)).astype(np.float32) - self.out = np.matmul(np.transpose(self.x), self.y) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((12, 9)).astype(np.float32) + self.out = np.matmul(np.transpose(self.x_fp32), self.y_fp32) def set_attributes(self): self.attrs = { "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - 'transpose_X': True + 'transpose_X': True, + 'transpose_Y': False } class TestDnnlMatMulOpTransposeY(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((12, 9)).astype(np.float32) - self.out = np.matmul(self.x, np.transpose(self.y)) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((12, 9)).astype(np.float32) + self.out = np.matmul(self.x_fp32, np.transpose(self.y_fp32)) def set_attributes(self): self.attrs = { "use_mkldnn": self.use_mkldnn, "mkldnn_data_type": self.mkldnn_data_type, - 'transpose_Y': True + 'transpose_Y': True, + 'transpose_X': False } class TestMatmulBf16MklDNNForceFp32Output(TestMatmulBf16MklDNNOp): def generate_data(self): - self.x = np.random.random((12, 9)).astype(np.float32) - self.y = np.random.random((9, 12)).astype(np.float32) + self.x_fp32 = np.random.random((12, 9)).astype(np.float32) + self.y_fp32 = np.random.random((9, 12)).astype(np.float32) self.force_fp32_output = True self.alpha = 0.5 - self.out = self.alpha * np.matmul(self.x, self.y) + self.out = self.alpha * np.matmul(self.x_fp32, self.y_fp32) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py index 2f557f0bf145e..724b9d9818dc4 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py @@ -19,7 +19,6 @@ from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci -@skip_check_grad_ci(reason="DNNL's MatMul doesn't implemend grad kernel.") class TestDnnlMatMulOp(OpTest): def generate_data(self): self.x = np.random.random((25, 2, 2)).astype("float32") @@ -48,21 +47,99 @@ def test_check_output(self): self.check_output() -class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulOp): +class TestDnnlMatMulWithGradOp(TestDnnlMatMulOp): + def test_check_grad(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) + + +class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 2, 3)).astype("float32") self.y = np.random.random((3, 4)).astype("float32") self.out = 
np.matmul(self.x, self.y) -class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulOp): +class TestDnnlMatMulOpMixedDimsYWiderTransposeY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 2, 3)).astype("float32") + self.y = np.random.random((4, 3)).astype("float32") + self.out = np.matmul(self.x, np.transpose(self.y)) + + def set_attributes(self): + self.attrs = {'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsYWiderTransposeX(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 3, 2)).astype("float32") + self.y = np.random.random((3, 4)).astype("float32") + self.out = np.matmul(np.transpose(self.x, (0, 2, 1)), self.y) + + def set_attributes(self): + self.attrs = {'transpose_X': True} + + +class TestDnnlMatMulOpMixedDimsXWiderTransposeXY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((8, 3, 2)).astype("float32") + self.y = np.random.random((4, 3)).astype("float32") + self.out = np.matmul( + np.transpose(self.x, (0, 2, 1)), np.transpose(self.y)) + + def set_attributes(self): + self.attrs = {'transpose_X': True, 'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsYWiderTransposeXY(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((3, 2)).astype("float32") + self.y = np.random.random((8, 4, 3)).astype("float32") + self.out = np.matmul( + np.transpose(self.x), np.transpose(self.y, (0, 2, 1))) + + def set_attributes(self): + self.attrs = {'transpose_X': True, 'transpose_Y': True} + + +class TestDnnlMatMulOpMixedDimsXWiderTransposeX(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5, 4)).astype("float32") + self.y = np.random.random((8, 5, 4)).astype("float32") + self.out = np.matmul(np.transpose(self.x), self.y) + + def set_attributes(self): + self.attrs = {'transpose_X': True} + + +class TestDnnlMatMulOpVectorMultiply(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5)).astype("float32") + self.y = np.random.random((5)).astype("float32") + self.out = np.matmul(self.x, self.y) + + +class TestDnnlMatMulOpVectorMultiplyTranspose(TestDnnlMatMulWithGradOp): + def generate_data(self): + self.x = np.random.random((5)).astype("float32") + x_resized = np.copy(self.x) + x_resized = np.expand_dims(x_resized, 1) + self.y = np.random.random((6)).astype("float32") + y_resized = np.copy(self.y) + y_resized = np.expand_dims(y_resized, 0) + self.out = np.matmul(x_resized, y_resized) + + def set_attributes(self): + self.attrs = {'transpose_Y': True, 'transpose_X': True} + + +class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((2, 3)).astype("float32") self.y = np.random.random((17, 3, 4)).astype("float32") self.out = np.matmul(self.x, self.y) -class TestDnnlMatMulOpAlpha(TestDnnlMatMulOp): +class TestDnnlMatMulOpAlpha(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 2, 3)).astype("float32") self.y = np.random.random((17, 3, 2)).astype("float32") @@ -70,18 +147,14 @@ def generate_data(self): self.out = self.alpha * np.matmul(self.x, self.y) -class TestDnnlMatMulOp2D(TestDnnlMatMulOp): - def print_tensor(self, name, tensor): - print(name) - print(tensor) - +class TestDnnlMatMulOp2D(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((9, 12)).astype("float32") self.out = np.matmul(self.x, self.y) -class 
TestDnnlMatMulOpTransposeX(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeX(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((12, 9)).astype("float32") @@ -91,7 +164,7 @@ def set_attributes(self): self.attrs = {'transpose_X': True} -class TestDnnlMatMulOpTransposeY(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeY(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((12, 9)).astype("float32") self.y = np.random.random((12, 9)).astype("float32") @@ -101,7 +174,7 @@ def set_attributes(self): self.attrs = {'transpose_Y': True} -class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulOp): +class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulWithGradOp): def generate_data(self): self.x = np.random.random((17, 3, 2)).astype("float32") self.y = np.random.random((17, 3, 2)).astype("float32") @@ -480,4 +553,6 @@ def test_check_output(self): if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..8e9f989f06c10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +@unittest.skipIf(core.is_compiled_with_cuda(), + "core is compiled with CUDA which has no BF implementation") +class TestScaleOpBF16(OpTest): + def setUp(self): + self.op_type = "scale" + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale = -2.3 + self.inputs = {'X': self.x_bf16} + self.attrs = {'scale': self.scale, 'use_mkldnn': True, 'bias': 0.4} + self.use_mkldnn = True + self.outputs = { + 'Out': (self.x_fp32 * self.attrs['scale']) + self.attrs['bias'] + } + + def calculate_grads(self): + bias = 0 + if 'bias' in self.attrs: + bias = self.attrs['bias'] + + scale = self.scale + if 'ScaleTensor' in self.attrs: + scale = self.attrs['ScaleTensor'] + + self.out = (self.x_fp32 * scale) + bias + self.dx = (self.out * scale) + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) + + +class TestScaleOpBF16BiasNotAfterScale(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale = 1.5 + self.inputs = {'X': self.x_bf16} + self.attrs = { + 'scale': self.scale, + 'use_mkldnn': True, + 'bias': 0.0, + 'bias_after_scale': False + } + self.use_mkldnn = True + self.outputs = { + 'Out': (self.x_fp32 + self.attrs['bias']) * self.attrs['scale'] + } + + +class TestScaleOpBF16ScaleTensor(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.scale = -2.3 + self.x_fp32 = np.random.random((10, 10)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale_tensor = np.array([self.scale]).astype(np.float32) + self.inputs = { + 'X': self.x_bf16, + 'ScaleTensor': convert_float_to_uint16(self.scale_tensor) + } + self.attrs = {'use_mkldnn': True} + self.outputs = {'Out': self.x_fp32 * self.scale} + + +class TestScaleOpBF16ScaleTensorNotBiasAfterScale(TestScaleOpBF16): + def setUp(self): + self.op_type = "scale" + self.scale = 1.2 + self.x_fp32 = np.random.random((9, 13)).astype(np.float32) + self.x_bf16 = convert_float_to_uint16(self.x_fp32) + self.scale_tensor = np.array([self.scale]).astype(np.float32) + self.inputs = { + 'X': self.x_bf16, + 'ScaleTensor': convert_float_to_uint16(self.scale_tensor) + } + self.attrs = { + 'bias': -1.1, + 'bias_after_scale': False, + 'use_mkldnn': True + } + self.outputs = {'Out': (self.x_fp32 + self.attrs['bias']) * self.scale} + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py new file mode 100644 index 0000000000000..528b55dcd873d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +import paddle +import paddle.fluid as fluid + + +class TestScaleOp(OpTest): + def setUp(self): + self.op_type = "scale" + self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} + self.attrs = {'scale': -2.3, 'use_mkldnn': True, 'bias': 0.2} + self.use_mkldnn = True + self.outputs = { + 'Out': (self.inputs['X'] * self.attrs['scale']) + self.attrs['bias'] + } + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpBiasNotAfterScale(OpTest): + def setUp(self): + self.op_type = "scale" + self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} + self.attrs = { + 'scale': 1.5, + 'use_mkldnn': True, + 'bias': 2.3, + 'bias_after_scale': False + } + self.use_mkldnn = True + self.outputs = { + 'Out': (self.inputs['X'] + self.attrs['bias']) * self.attrs['scale'] + } + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpScaleTensor(OpTest): + def setUp(self): + self.op_type = "scale" + self.scale = -2.3 + self.inputs = { + 'X': np.random.random((10, 10)).astype(np.float32), + 'ScaleTensor': np.array([self.scale]).astype(np.float32) + } + self.attrs = {} + self.outputs = {'Out': self.inputs['X'] * self.scale} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestScaleOpScaleTensorNotBiasAfterScale(OpTest): + def setUp(self): + self.op_type = "scale" + self.scale = -1.2 + self.inputs = { + 'X': np.random.random((10, 10)).astype(np.float32), + 'ScaleTensor': np.array([self.scale]).astype(np.float32) + } + self.attrs = {'bias': -6.8, 'bias_after_scale': False} + self.outputs = { + 'Out': + (self.inputs['X'] + self.attrs['bias']) * self.inputs['ScaleTensor'] + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py index 9e2229cece75c..13c1883af6184 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py @@ -129,4 +129,6 @@ def test_check(self): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/new_group.py index fb7beeee1df2e..c9c4acc3220c7 100644 --- a/python/paddle/fluid/tests/unittests/new_group.py +++ b/python/paddle/fluid/tests/unittests/new_group.py @@ -27,6 +27,7 @@ def __init__(self): 
def test_all(self): gp = paddle.distributed.new_group([0, 1]) + print("gp info:", gp) print("test new group api ok") tmp = np.array([0, 0, 0]) diff --git a/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py b/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py new file mode 100644 index 0000000000000..a85bd4fccc3a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base_npu import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveIdentity(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + nranks = 2 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofgather", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_identity", + inputs={'X': tindata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id, + 'nranks': nranks}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveIdentity, "identity", 0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py index b5175bdb19c7e..5aeca5abd9f83 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py @@ -35,21 +35,21 @@ def setUp(self): self.set_npu() self.init_dtype() np.random.seed(SEED) - pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) - label = pred.copy() - accuracy = np.array([1]).astype(self.dtype) - correct = np.array([11 * 1]).astype(self.dtype) - total = np.array([11 * 1]).astype(self.dtype) - - self.inputs = { - "Out": OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for 
ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } def set_npu(self): @@ -69,53 +69,70 @@ def setUp(self): self.set_npu() self.init_dtype() np.random.seed(SEED) - pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) - label = np.random.uniform(4, 5, [11, 1]).astype(self.dtype) - accuracy = np.array([0]).astype(self.dtype) - correct = np.array([11 * 0]).astype(self.dtype) - total = np.array([11 * 1]).astype(self.dtype) - - self.inputs = { - "Out": OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int64') + label = np.random.randint(0, 1000, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } -class TestAccuracy3(TestAccuracy): +class TestAccuracyType(TestAccuracy): def setUp(self): self.op_type = "accuracy" self.set_npu() self.init_dtype() np.random.seed(SEED) - a = np.random.randint(1, 2, [5, 1]) - b = np.random.randint(0, 1, [5, 1]) - pred = np.row_stack((a, b)).astype(self.dtype) - label = np.random.randint(1, 2, [10, 1]).astype(self.dtype) - accuracy = np.array([0.5]).astype(self.dtype) - correct = np.array([5]).astype(self.dtype) - total = np.array([10 * 1]).astype(self.dtype) - - self.inputs = { - "Out": OpTest.np_dtype_to_fluid_dtype(pred), - "Label": OpTest.np_dtype_to_fluid_dtype(label), - "Indices": OpTest.np_dtype_to_fluid_dtype(pred) - } + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int64') + label = np.random.randint(0, 1000, (n, 1)).astype('int32') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break self.outputs = { - "Accuracy": accuracy, - "Correct": correct, - "Total": total + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") } -class TestAccuracyInt(TestAccuracy): - def init_dtype(self): - self.dtype = np.int +class TestAccuracyType2(TestAccuracy): + def setUp(self): + self.op_type = "accuracy" + self.set_npu() + self.init_dtype() + np.random.seed(SEED) + n = 8192 + infer = np.random.random((n, 100)).astype(self.dtype) + indices = np.random.randint(0, 1000, (n, 100)).astype('int32') + label = np.random.randint(0, 1000, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': 
np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index ec616070b63ab..a3b4242f39d36 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -134,6 +134,65 @@ def test_check_output(self): self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False) +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False) + + @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestNet(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py b/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py new file mode 100644 index 0000000000000..9ea52a88d9897 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle +import os + +from test_collective_base_npu import TestDistBase + +paddle.enable_static() + + +class TestIdentityOp(TestDistBase): + def _setup_config(self): + pass + + def test_identity(self, col_type="identity"): + dist_env = os.environ + self.check_with_place( + "collective_identity_op_npu.py", col_type, need_envs=dist_env) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py new file mode 100644 index 0000000000000..ba2b6329a2564 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py @@ -0,0 +1,221 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import numpy as np +import unittest +import time +import argparse +import os +import six +import sys +import subprocess +import traceback +import functools +import pickle +from contextlib import closing +from six import string_types +import paddle.fluid as fluid +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core + + +class TestCollectiveRunnerBase(object): + def get_model(self, train_prog, startup_prog): + raise NotImplementedError( + "get model should be implemented by child class.") + + def wait_server_ready(self, endpoints): + assert not isinstance(endpoints, string_types) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with closing( + socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + sys.stderr.write("server not ready, wait 3 sec to retry...\n") + sys.stderr.write("not ready endpoints:" + str( + not_ready_endpoints) + "\n") + sys.stderr.flush() + time.sleep(3) + else: + break + +#endpoints should be ["ip1:port1","ip2:port2"] + + def initCommunicator(self, program, rank, nranks, wait_port, + current_endpoint, endpoints): + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: + self.wait_server_ready(other_endpoints) + block = program.global_block() + hccl_id_var = block.create_var( + name=nameGen.generate('hccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': self.global_ring_id, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks + }) + + def run_trainer(self, args): + train_prog = fluid.Program() + startup_prog = 
fluid.Program() + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + self.initCommunicator(startup_prog, rank, nranks, True, + current_endpoint, endpoints) + self.rank = rank + result = self.get_model(train_prog, startup_prog) + device_id = int(os.getenv("FLAGS_selected_npus", "0")) + place = fluid.NPUPlace(device_id) + exe = fluid.Executor(place) + exe.run(startup_prog) + np.random.seed(os.getpid()) + indata = np.random.random((10, 1000)) + out = exe.run(train_prog, + feed={'tindata': indata}, + fetch_list=[result.name]) + if six.PY2: + print(pickle.dumps(out)) + else: + sys.stdout.buffer.write(pickle.dumps(out)) + + +def runtime_main(test_class, col_type, sub_type): + args = {} + model = test_class() + args["deviceid"] = os.getenv("FLAGS_selected_npus") + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + model.run_trainer(args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + #print("w0_ep:",w0_ep," w1_ep:",w1_ep) + env0 = { + "FLAGS_selected_npus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep, + } + + env1 = { + "FLAGS_selected_npus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep, + } + #update environment + env0.update(envs) + env1.update(envs) + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") + #print(tr0_cmd) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, model_file, col_type, need_envs={}): + + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, need_envs) + np.random.seed(pid0) + input1 = np.random.random((10, 1000)) + np.random.seed(pid1) + input2 = np.random.random((10, 1000)) + if col_type == "identity": + need_result1 = input1 + need_result2 = input2 + 
self.assertTrue(np.allclose(tr0_out, need_result1, rtol=0, atol=0)) + self.assertTrue(np.allclose(tr1_out, need_result2, rtol=0, atol=0)) diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 400ddd9d4aab0..2463ddb7137ac 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -41,7 +41,7 @@ def setUp(self): vocab = 10 dim = 20 w = np.ones([vocab, dim]).astype(self.dtype) - x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) out = np.ones([bsz, seqlen, dim]).astype(self.dtype) self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py index 1060e67078f8d..cae3239229f44 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py @@ -71,8 +71,7 @@ def init(self): } def test_check_output(self): - self.check_output_with_place( - self.place, check_dygraph=False, no_check_set=['Out']) + self.check_output_with_place(self.place, check_dygraph=False) class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): @@ -103,9 +102,6 @@ def setUp(self): 'OutBadSteps': self.zero_steps } - def test_check_output(self): - self.check_output_with_place(self.place, check_dygraph=False) - @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 25717b7967712..3524d1e553d1b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -132,6 +132,8 @@ def product(dim): tensor_to_check_dtype = np.float16 # set delta as np.float16, will automatic convert to float32, float64 delta = np.array(delta).astype(np.float16) + elif tensor_to_check_dtype == core.VarDesc.VarType.BF16: + tensor_to_check_dtype = np.float32 else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -140,9 +142,10 @@ def get_output(): sum = [] op.run(scope, place) for output_name in output_names: - sum.append( - np.array(scope.find_var(output_name).get_tensor()).astype( - tensor_to_check_dtype).mean()) + output_numpy = np.array(scope.find_var(output_name).get_tensor()) + if tensor_to_check._dtype() == core.VarDesc.VarType.BF16: + output_numpy = convert_uint16_to_float(output_numpy) + sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) return tensor_to_check_dtype(np.array(sum).sum() / len(output_names)) gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) @@ -152,6 +155,11 @@ def __get_elem__(tensor, i): numpy_tensor = np.array(tensor).astype(np.float16) numpy_tensor = numpy_tensor.flatten() return numpy_tensor[i] + elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16: + numpy_tensor = np.array(tensor).astype(np.uint16) + numpy_tensor = numpy_tensor.flatten() + return struct.unpack(' 1e-10, abs_a <= 1e-8)] *= 1e4 abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2 + elif self.is_bfloat16_op(): + abs_a[abs_a < 1e-2] = 1 else: abs_a[abs_a < 1e-3] = 1 @@ -1494,6 +1517,13 @@ def check_grad_with_place(self, dygraph_grad = self._get_dygraph_grad( inputs_to_check, 
place, output_names, user_defined_grad_outputs, no_grad_set) + fp32_grads = [] + for grad in dygraph_grad: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = 0.03 + fp32_grads.append(grad) + dygraph_grad = fp32_grads self._assert_is_close(numeric_grads, dygraph_grad, inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place)) @@ -1538,6 +1568,21 @@ def _get_dygraph_grad(self, outputs=outputs, attrs=attrs_outputs if hasattr(self, "attrs") else None) + if self.dtype == np.uint16: + cast_inputs = self._find_var_in_dygraph(outputs, + output_names[0]) + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32 + }) + outputs = {output_names[0]: cast_outputs} + outputs_valid = {} for output_name in output_names: outputs_valid[output_name] = self._find_var_in_dygraph( @@ -1653,6 +1698,21 @@ def _get_gradient(self, feed_dict = self.feed_var(inputs, place) if user_defined_grad_outputs is None: + if self.dtype == np.uint16: + cast_inputs = list(map(block.var, output_names)) + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32 + }) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names = [cast_outputs.name] loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py index 7002352240973..5c518976d1f36 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -74,8 +74,8 @@ def test_multiple_gpus(self): state_dict = model_a.state_dict() model_b.set_state_dict(state_dict) - model_a = paddle.DataParallel(model_a) - model_b = paddle.DataParallel(model_b) + model_a = paddle.DataParallel(model_a, find_unused_parameters=True) + model_b = paddle.DataParallel(model_b, find_unused_parameters=True) ones_input = paddle.ones(shape=(batch, in_dim)) ones_input.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 278d7b27c5288..2719e28fea08b 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -27,6 +27,7 @@ class SpawnAssistTestArgs(object): update_method = "local" trainer_id = 0 + find_unused_parameters = False class TestDistSpawnRunner(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 92465c3e28401..ef5ac46cede42 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -18,7 +18,7 @@ import numpy as np from scipy.special import expit, erf -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -1103,12 +1103,19 @@ def setUp(self): 
self.init_dtype() np.random.seed(1024) - x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) - # The same reason with TestAbs - x[np.abs(x) < 0.005] = 0.02 - out = np.maximum(x, 0) + if self.dtype == np.uint16: + x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = convert_float_to_uint16(np.maximum(x, 0)) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} - self.inputs = {'X': x} self.outputs = {'Out': out} def test_check_grad(self): @@ -2718,7 +2725,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestGelu) create_test_act_fp16_class(TestBRelu) create_test_act_fp16_class(TestRelu6) -create_test_act_fp16_class(TestSoftRelu) +create_test_act_fp16_class(TestSoftRelu, grad_atol=0.85) create_test_act_fp16_class(TestELU) create_test_act_fp16_class(TestReciprocal) create_test_act_fp16_class(TestLog) @@ -2736,8 +2743,35 @@ def test_check_grad(self): create_test_act_fp16_class(TestSoftsign) create_test_act_fp16_class(TestThresholdedRelu) create_test_act_fp16_class(TestHardSigmoid) -create_test_act_fp16_class(TestSwish) +create_test_act_fp16_class(TestSwish, grad_atol=0.85) create_test_act_fp16_class(TestHardSwish) + +def create_test_act_bf16_class(parent, + atol=1e-2, + grad_check=True, + grad_atol=0.80): + @unittest.skipIf(not paddle.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestActBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=grad_atol) + + cls_name = "{0}_{1}".format(parent.__name__, "bf16") + TestActBF16.__name__ = cls_name + globals()[cls_name] = TestActBF16 + + +create_test_act_bf16_class(TestRelu) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index cb646ef0b9321..1e316c3383ea7 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -404,7 +404,7 @@ def test_check_output(self): class TestAdamOpBetaEpsilonVariable(OpTest): def setUp(self): - '''Test Adam Op with beta as Variable + '''Test Adam Op with beta/epsilon as Variable ''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -450,6 +450,57 @@ def test_check_output(self): self.check_output() +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + '''Test Adam Op with global_beta_pow + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + beta1 = 0.85 + beta2 = 0.95 + + learning_rate = 0.001 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': 
np.array([beta2_pow]).astype("float32"), + "Beta1Tensor": np.array([beta1]).astype("float32"), + "Beta2Tensor": np.array([beta2]).astype("float32"), + "EpsilonTensor": np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def test_check_output(self): + self.check_output() + + class TestAdamOpV2(unittest.TestCase): def test_adam_op(self): place = fluid.CPUPlace() @@ -493,6 +544,7 @@ def test_adam_op_dygraph(self): out.backward() adam.step() adam.clear_gradients() + paddle.enable_static() def test_adam_op_with_state_dict(self): @@ -523,6 +575,7 @@ def test_adam_op_with_state_dict(self): params = adam.get_opti_var_name_list() assert (params is not None) + paddle.enable_static() def test_adam_with_grad_clip(self): paddle.disable_static() @@ -536,6 +589,7 @@ def test_adam_with_grad_clip(self): out.backward() adam.step() adam.clear_gradients() + paddle.enable_static() def test_adam_op_with_set_lr(self): paddle.disable_static() @@ -550,6 +604,7 @@ def test_adam_op_with_set_lr(self): lr_var = paddle.fluid.layers.create_global_var( shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) + paddle.enable_static() def test_adam_op_invalid_input(self): paddle.disable_static() @@ -563,6 +618,7 @@ def test_adam_op_invalid_input(self): with self.assertRaises(ValueError): adam = paddle.optimizer.Adam( 0.1, epsilon=-1, parameters=linear.parameters()) + paddle.enable_static() def test_adam_op_with_sparse_input_and_weight_decay(self): @@ -577,10 +633,15 @@ def test_adam_op_with_sparse_input_and_weight_decay(self): out = emb(x) out.backward() adam.step() + paddle.enable_static() class TestNetWithEpsilonTensor(unittest.TestCase): - def _test(self, place, use_tensor=True, use_fluid_api=True): + def _test(self, + place, + use_tensor=True, + use_fluid_api=True, + use_global_beta_pow=False): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -633,7 +694,8 @@ def _test(self, place, use_tensor=True, use_fluid_api=True): learning_rate=0.01, beta1=beta1, beta2=beta2, - epsilon=epsilon) + epsilon=epsilon, + use_global_beta_pow=use_global_beta_pow) else: adam = paddle.optimizer.Adam( learning_rate=0.01, @@ -646,7 +708,9 @@ def _test(self, place, use_tensor=True, use_fluid_api=True): learning_rate=0.01, beta1=beta1_init, beta2=beta2_init, - epsilon=epsilon_init) + epsilon=epsilon_init, + use_global_beta_pow=use_global_beta_pow, + name='a') else: adam = fluid.optimizer.Adam( learning_rate=0.01, @@ -680,9 +744,11 @@ def _test_with_place(self, place): for use_tensor in [True, False]: for use_fluid_api in [True, False]: - pred, loss = self._test(place, use_tensor, use_fluid_api) - preds.append(pred) - losses.append(loss) + for use_global_beta_pow in [True, False]: + pred, loss = self._test(place, use_tensor, use_fluid_api, + use_global_beta_pow) + preds.append(pred) + losses.append(loss) for pred in preds: self.assertTrue(np.allclose(pred, preds[0])) for loss in losses: @@ -694,6 +760,55 @@ def test_adam_api(self): if core.is_compiled_with_cuda(): self._test_with_place(paddle.CUDAPlace(0)) + def test_adam_exception(self): + paddle.enable_static() + a = paddle.static.data(name="a", 
shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data(name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(use_global_beta_pow=True) + adam.minimize(loss) + self.assertRaises(Exception, adam._get_global_accumulator, 'tmp') + adam._add_global_accumulator( + 'tmp', type=core.VarDesc.VarType.LOD_TENSOR) + adam._get_global_accumulator('tmp') + self.assertRaises( + Exception, + adam._add_global_accumulator, + adam._beta1_pow_acc_str, + type=core.VarDesc.VarType.LOD_TENSOR) + paddle.disable_static() + + def test_adam_save_load(self): + paddle.disable_static() + a = paddle.rand([4, 10]) + linear = paddle.nn.Linear(10, 10) + b = linear(a) + state_dict = linear.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.fluid.optimizer.Adam( + learning_rate=scheduler, + parameter_list=linear.parameters(), + use_global_beta_pow=True) + adam.minimize(b) + state_dict = adam.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") + adam.set_state_dict(opti_state_dict) + + paddle.enable_static() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 27c8869b21d82..fb5b8bde10609 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -341,7 +341,7 @@ def setUp(self): self.linear.register_buffer("buf_name", buffer, persistable=True) sublayer = paddle.nn.Conv1D(3, 2, 3) - self.linear.add_sublayer(1, sublayer) + self.linear.add_sublayer("1", sublayer) def test_to_api(self): self.linear.to(dtype='double') @@ -351,8 +351,8 @@ def test_to_api(self): paddle.fluid.core.VarDesc.VarType.FP64) self.assertTrue( np.allclose(self.linear.weight.grad.numpy(), self.new_grad)) - self.assertTrue(self.linear.weight._grad_ivar().dtype, - paddle.fluid.core.VarDesc.VarType.FP64) + self.assertEqual(self.linear.weight._grad_ivar().dtype, + paddle.fluid.core.VarDesc.VarType.FP64) self.linear.to() self.assertEqual(self.linear.weight.dtype, @@ -361,8 +361,10 @@ def test_to_api(self): paddle.fluid.core.VarDesc.VarType.FP64) self.assertTrue( np.allclose(self.linear.weight.grad.numpy(), self.new_grad)) - self.assertTrue(self.linear.weight._grad_ivar().dtype, - paddle.fluid.core.VarDesc.VarType.FP64) + self.assertEqual(self.linear.weight._grad_ivar().dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + for p in self.linear.parameters(): + self.assertTrue(isinstance(p, paddle.fluid.framework.ParamBase)) if paddle.fluid.is_compiled_with_cuda(): self.linear.to(device=paddle.CUDAPlace(0)) @@ -384,6 +386,8 @@ def test_to_api(self): )) self.assertEqual( self.linear.weight._grad_ivar().place.gpu_device_id(), 0) + for p in self.linear.parameters(): + self.assertTrue(isinstance(p, paddle.fluid.framework.ParamBase)) self.linear.to(device=paddle.CPUPlace()) self.assertTrue(self.linear.weight.place.is_cpu_place()) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py 
b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py index 5de1ebf581372..0b956d5031fec 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py @@ -73,6 +73,7 @@ def test_communicator_ps_gpu(self): dataset.init( batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) dataset.set_filelist(["test_communicator_ps_gpu.txt"]) + dataset._set_use_ps_gpu(1) dataset.load_into_memory() os.environ["TEST_MODE"] = "1" diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 8dc80c8931269..a2dd7e49ac4cc 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -139,6 +139,22 @@ def test_broadcast_api_2(self): fetch_list=[out]) self.assertEqual((res == real_result).all(), True) + def test_broadcast_api_3(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype='int32') + y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(np.int32) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + def test_attr_name(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_complex_cast.py b/python/paddle/fluid/tests/unittests/test_complex_cast.py new file mode 100644 index 0000000000000..b4162be5b3691 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_complex_cast.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function, division + +import unittest +import numpy as np + +import paddle + + +class TestComplexCastOp(unittest.TestCase): + def test_complex_to_real(self): + r = np.random.random(size=[10, 10]) * 10 + i = np.random.random(size=[10, 10]) + + c_t = paddle.to_tensor(r + i * 1J, dtype='complex64') + + self.assertEqual(c_t.cast('int64').dtype, paddle.int64) + self.assertEqual(c_t.cast('int32').dtype, paddle.int32) + self.assertEqual(c_t.cast('float32').dtype, paddle.float32) + self.assertEqual(c_t.cast('float64').dtype, paddle.float64) + self.assertEqual(c_t.cast('bool').dtype, paddle.bool) + + self.assertTrue( + np.allclose(c_t.cast('int64').numpy(), r.astype('int64'))) + self.assertTrue( + np.allclose(c_t.cast('int32').numpy(), r.astype('int32'))) + self.assertTrue( + np.allclose(c_t.cast('float32').numpy(), r.astype('float32'))) + self.assertTrue( + np.allclose(c_t.cast('float64').numpy(), r.astype('float64'))) + self.assertTrue(np.allclose(c_t.cast('bool').numpy(), r.astype('bool'))) + + def test_real_to_complex(self): + r = np.random.random(size=[10, 10]) * 10 + r_t = paddle.to_tensor(r) + + self.assertEqual(r_t.cast('complex64').dtype, paddle.complex64) + self.assertEqual(r_t.cast('complex128').dtype, paddle.complex128) + + self.assertTrue(np.allclose(r_t.cast('complex64').real().numpy(), r)) + self.assertTrue(np.allclose(r_t.cast('complex128').real().numpy(), r)) + + def test_complex64_complex128(self): + r = np.random.random(size=[10, 10]) + i = np.random.random(size=[10, 10]) + + c = r + i * 1J + c_64 = paddle.to_tensor(c, dtype='complex64') + c_128 = paddle.to_tensor(c, dtype='complex128') + + self.assertTrue(c_64.cast('complex128').dtype, paddle.complex128) + self.assertTrue(c_128.cast('complex128').dtype, paddle.complex64) + self.assertTrue( + np.allclose(c_64.cast('complex128').numpy(), c_128.numpy())) + self.assertTrue( + np.allclose(c_128.cast('complex128').numpy(), c_64.numpy())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index ad5420b92c092..0470a2df35f68 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -96,7 +96,7 @@ def false_func(): self.assertTrue( np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32))) self.assertTrue( - np.allclose(np.asarray(ret[1]), np.full((2, 3), True, np.bool))) + np.allclose(np.asarray(ret[1]), np.full((2, 3), True, bool))) def test_pass_and_modify_var(self): """ diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 897d76a35dcab..731e4b54e22c3 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -20,6 +20,7 @@ import unittest from test_softmax_op import stable_softmax from test_softmax_with_cross_entropy_op import cross_entropy +from paddle.fluid import Program, program_guard def stable_softmax(x): @@ -1363,5 +1364,38 @@ def test_cross_entropy_loss_2d_sum(self): self.assertTrue(np.allclose(dy_ret_value, expected)) +class TestCrossEntropyFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_LabelValue(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") + label_data[0] = 255 + weight_data = paddle.rand([100]) + 
paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=255) + + self.assertRaises(ValueError, test_LabelValue) + + def test_LabelValueNeg(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") + label_data[0] = -1 + weight_data = paddle.rand([100]) + paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=-1) + + self.assertRaises(ValueError, test_LabelValueNeg) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py index 5796e13336ccf..6e8ee5589db77 100644 --- a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py @@ -77,7 +77,11 @@ def assertInputData(self, batch_id, input_data, dev_cnt): def get_places(self): place_list = [fluid.cpu_places(1), fluid.cpu_places(4)] if fluid.is_compiled_with_cuda(): - place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])]) + if os.name == "nt": + place_list.extend([fluid.cuda_places(0)]) + else: + place_list.extend( + [fluid.cuda_places(0), fluid.cuda_places([0, 1])]) return place_list def test_main(self): diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py index 89bbc88e01eaf..f779d762fb302 100644 --- a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py @@ -96,7 +96,11 @@ def assertInputData(self, batch_id, input_data, dev_cnt, def get_places(self): place_list = [fluid.cpu_places(1), fluid.cpu_places(4)] if fluid.is_compiled_with_cuda(): - place_list.extend([fluid.cuda_places(0), fluid.cuda_places([0, 1])]) + if os.name == "nt": + place_list.extend([fluid.cuda_places(0)]) + else: + place_list.extend( + [fluid.cuda_places(0), fluid.cuda_places([0, 1])]) return place_list def test_main(self): diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index a7c1b14d269f4..0be329ac959f0 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -19,9 +19,9 @@ import six import unittest -EPOCH_NUM = 20 -BATCH_SIZE = 32 -BATCH_NUM = 20 +EPOCH_NUM = 5 +BATCH_SIZE = 16 +BATCH_NUM = 10 CLASS_NUM = 10 @@ -29,7 +29,7 @@ def random_reader(): np.random.seed(1) for i in range(BATCH_SIZE * BATCH_NUM): image = np.random.random([784]) - label = np.random.random_integers(low=0, high=CLASS_NUM - 1) + label = np.random.randint(low=0, high=CLASS_NUM) yield image, label diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37494294418f1..edc510e4e766d 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -548,7 +548,10 @@ def run_trainer_with_spawn(self, args): # 4. 
train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = paddle.DataParallel(model) + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) out_losses = [] for step_id, data in enumerate(train_reader()): @@ -581,8 +584,8 @@ def run_use_fleet_api_trainer(self, args): # set strategy strategy = fleet.DistributedStrategy() - if not args.find_unused_parameters: - strategy.find_unused_parameters = False + if args.find_unused_parameters: + strategy.find_unused_parameters = True # 3. init parallel env if args.update_method == "nccl2" or "bkcl": @@ -737,7 +740,7 @@ def setUp(self): self._save_model = False self._fuse_all_reduce = None self._accumulate_gradient = False - self._find_unused_parameters = True + self._find_unused_parameters = False self._setup_config() global DIST_UT_PORT diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index 9bc48ac0a1b2d..eae19afb2ef86 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -25,12 +25,15 @@ def count_of_sparse_all_reduce_calls(file_name): - cmd = 'grep sparse_all_reduce_op_handle ' + file_name + ' | grep in_numel | wc -l' + # NOTE(Aurelius84): The log file contains some binary contents that causes error + # while `grep`. So we add `-a` to fix it. + # -a, --text equivalent to --binary-files=text, make binaries equivalent to text. + cmd = 'grep -a sparse_all_reduce_op_handle ' + file_name + ' | grep in_numel | wc -l' child = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) result = child.communicate()[0] print('test_info: result = ' + str(result)) - # note. in python3, result is b'num', != 'num' + # NOTE: in python3, result is b'num', != 'num' return int(result) @@ -59,7 +62,7 @@ def tearDown(self): # only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3 # temp close this test. In python3 CI, the log is right, but the result - # has a problem, may be in multi process mode, log is not writed in time. + # has a problem, may be in multi process mode, log is not written in time. 
# self.assertEqual(result, 3) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py index d666ea6740be1..7ca08bcb9d7f9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -14,6 +14,8 @@ import unittest import paddle +paddle.enable_static() + import os import paddle.fluid as fluid @@ -21,18 +23,16 @@ class TestFleetBase(unittest.TestCase): def setUp(self): os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" os.environ["PADDLE_TRAINERS_NUM"] = "2" os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + "127.0.0.1:36001,127.0.0.2:36001" def test_ps_minimize(self): import paddle import paddle.distributed.fleet as fleet - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["PADDLE_TRAINER_ID"] = "1" input_x = paddle.fluid.layers.data( name="x", shape=[32], dtype='float32') @@ -47,24 +47,26 @@ def test_ps_minimize(self): role = fleet.PaddleCloudRoleMaker(is_collective=False) fleet.init(role) + strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False + strategy.a_sync_configs = {"launch_barrier": False} + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) place = fluid.CPUPlace() exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name) compiled_prog = fluid.compiler.CompiledProgram( fluid.default_main_program()) - self.assertRaises( - Exception, - fleet.save_inference_model, - dirname='/tmp/', - feeded_var_names=['x', 'y'], - target_vars=[avg_cost], - executor=pe) + + fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost]) + fleet.fleet.save( + dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost]) + fleet.fleet.save(dirname="/tmp") self.assertRaises( Exception, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 05da44cd06133..628f1db80d2d4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -80,15 +80,17 @@ def gen_data(): cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) print("cost of step[{}] = {}".format(i, cost_val)) - proc_a = launch_func(node_func, node_a) - proc_a.start() + # rank 1 + proc_b = launch_func(node_func, node_b) + proc_b.start() + # rank 0, for wait server ready coverage # just for coverage - for key in node_b: - os.environ[key] = node_b[key] + for key in node_a: + os.environ[key] = node_a[key] node_func() - proc_a.join() + proc_b.join() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py new file mode 100644 index 0000000000000..604109b262d6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os + +paddle.enable_static() + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def test_pipeline_optimizer(self): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.without_graph_optimization = True + + optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_launch_coverage.py b/python/paddle/fluid/tests/unittests/test_launch_coverage.py index 43613928585e7..9fbf27e3c1d06 100644 --- a/python/paddle/fluid/tests/unittests/test_launch_coverage.py +++ b/python/paddle/fluid/tests/unittests/test_launch_coverage.py @@ -24,6 +24,7 @@ from argparse import ArgumentParser, REMAINDER from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args +from paddle.distributed.fleet.launch_utils import find_free_ports def _parse_args(): @@ -115,6 +116,9 @@ def test_gpus(self): args.use_paddlecloud = True cluster, pod = get_cluster_from_args(args, "0") + def test_find_free_ports(self): + find_free_ports(2) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index c48ec2a4fb458..31c68b88b86a7 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -50,15 +50,30 @@ def setUp(self): 'keepdim': self.keepdim, 'reduce_all': self.reduce_all } + self.user_defined_grads = None + self.user_defined_grad_outputs = None + self.set_attrs_addition() def set_attrs(self): pass + def set_attrs_addition(self): + pass + def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], ['Out']) + self.check_grad( + ['X'], ['Out'], + user_defined_grads=self.user_defined_grads, + user_defined_grad_outputs=self.user_defined_grad_outputs) + + def calc_grad(self): + dy = np.ones(1, dtype=self.dtype) + x = self.inputs['X'] + y = self.outputs['Out'] + return dy * np.exp(x - y) class 
TestLogsumexp_shape(TestLogsumexp): @@ -75,6 +90,11 @@ class TestLogsumexp_axis_all(TestLogsumexp): def set_attrs(self): self.axis = [0, 1, 2, 3] + def set_attrs_addition(self): + if paddle.fluid.core.is_compiled_with_rocm(): + self.user_defined_grads = [self.calc_grad()] + self.user_defined_grad_outputs = [np.ones(1, dtype=self.dtype)] + class TestLogsumexp_keepdim(TestLogsumexp): def set_attrs(self): @@ -85,6 +105,11 @@ class TestLogsumexp_reduce_all(TestLogsumexp): def set_attrs(self): self.reduce_all = True + def set_attrs_addition(self): + if paddle.fluid.core.is_compiled_with_rocm(): + self.user_defined_grads = [self.calc_grad()] + self.user_defined_grad_outputs = [np.ones(1, dtype=self.dtype)] + class TestLogsumexpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 3a5c43b2bab3e..be2a6a653cc6f 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -412,11 +412,10 @@ def test_save_load_complex_object_dygraph_save(self): ] obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123} obj3 = (paddle.randn( - [5, 4], dtype='float32'), np.ndarray( - [3, 4], dtype="float32"), { - "state_dict": state_dict, - "opt": state_dict - }) + [5, 4], dtype='float32'), np.random.randn(3, 4).astype("float32"), { + "state_dict": state_dict, + "opt": state_dict + }) obj4 = (np.random.randn(5, 6), (123, )) path1 = "test_save_load_any_complex_object_dygraph/obj1" diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index 8b508d5c9ae79..7385da56beab3 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -19,6 +19,7 @@ import os import sys import six +import platform import paddle import paddle.nn as nn @@ -162,12 +163,13 @@ def test_save_load_lod_tensor(self): with self.assertRaises(NotImplementedError): path = 'test_save_load_error/temp' paddle.save({}, path, use_binary_format=True) - - with self.assertRaises(ValueError): - path = 'test_save_load_error/temp' - with open(path, "w") as f: - f.write('\0') - paddle.load(path) + # On the Windows platform, when parsing a string that can't be parsed as a `Program`, `desc_.ParseFromString` has a timeout risk. 
+ if 'Windows' != platform.system(): + with self.assertRaises(ValueError): + path = 'test_save_load_error/temp' + with open(path, "w") as f: + f.write('\0') + paddle.load(path) with self.assertRaises(ValueError): temp_lod = fluid.core.LoDTensor() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py index fa571bde5e43b..3c45b2c795037 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py @@ -30,6 +30,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_net(self): if fluid.core.is_compiled_with_cuda(): @@ -46,6 +47,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._use_fleet_api = True + self._find_unused_parameters = True class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame): @@ -54,6 +56,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = True class TestDygraphControlFlowDiff(TestDistBase): @@ -61,6 +64,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_net(self): if fluid.core.is_compiled_with_cuda(): @@ -77,6 +81,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._use_fleet_api = True + self._find_unused_parameters = True class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff): @@ -85,6 +90,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 5491b451368c8..f3cd97ee1ec86 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -17,8 +17,11 @@ import unittest import time import paddle.fluid as fluid +import copy +import os +import subprocess -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, start_local_trainers +from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def get_cluster_from_args(selected_gpus): @@ -46,6 +49,55 @@ def get_gpus(selected_gpus): return selected_gpus +def start_local_trainers(cluster, + pod, + training_script, + training_script_args, + log_dir=None): + current_env = copy.copy(os.environ.copy()) + #paddle broadcast ncclUniqueId use socket, and + #proxy maybe make trainers unreachable, so delete them. + #if we set them to "", grpc will log error message "bad uri" + #so just delete them. 
+ current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + procs = [] + for t in pod.trainers: + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), + "PADDLE_TRAINER_ID": "%d" % t.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + + current_env.update(proc_env) + + print("trainer proc env:{}".format(current_env)) + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + cmd = "python -m coverage run --branch -p " + training_script + else: + cmd = "python -u " + training_script + + print("start trainer proc:{} env:{}".format(cmd, proc_env)) + + fn = None + + proc = subprocess.Popen(cmd.split(" "), env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = t.rank + tp.log_fn = fn + tp.cmd = cmd + + procs.append(tp) + + return procs + + class TestMultipleGpus(unittest.TestCase): def run_mnist_2gpu(self, target_file_name): if not fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 782d2304619f2..0c55e135721ce 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -31,6 +31,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py similarity index 89% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py rename to python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index f3b89d694f70b..1d06e168208b2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -24,6 +24,9 @@ class TestHybridPipeParallel(TestMultipleGpus): def test_hybrid_parallel_pp_layer(self): self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') + def test_hybrid_parallel_pp_tuple_inputs(self): + self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py rename to python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py index d64aa510f4e1a..a34982ef3dd67 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py @@ -17,6 +17,7 @@ import six import paddle.fluid as fluid import paddle +import os def enable_parallel_ssa_executor(enabled=True): @@ -65,6 +66,9 @@ def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor, if fluid.core.globals()[ 'FLAGS_enable_parallel_graph'] and not 
use_gpu: return + # windows has only 1 GPU + if use_gpu and dev_cnt > 1 and os.name == "nt": + return else: if use_gpu: return diff --git a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py index 7f8294ad0efe7..f62e160673f8d 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py @@ -22,7 +22,7 @@ class TestPipelineParallel(TestMultipleGpus): def test_pipeline_parallel(self): - self.run_mnist_2gpu('hybrid_parallel_pp_model.py') + self.run_mnist_2gpu('hybrid_parallel_pp_alexnet.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 83f02b629d7ac..b3671327ca295 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -248,6 +248,21 @@ def test_to_tensor_change_place(self): a = paddle.to_tensor(a, place=paddle.CUDAPinnedPlace()) self.assertEqual(a.place.__repr__(), "CUDAPinnedPlace") + def test_to_tensor_with_lodtensor(self): + if core.is_compiled_with_cuda(): + a_np = np.random.rand(1024, 1024) + with paddle.fluid.dygraph.guard(core.CPUPlace()): + lod_tensor = core.LoDTensor() + lod_tensor.set(a_np, core.CPUPlace()) + a = paddle.to_tensor(lod_tensor) + self.assertTrue(np.array_equal(a_np, a.numpy())) + + with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): + lod_tensor = core.LoDTensor() + lod_tensor.set(a_np, core.CUDAPlace(0)) + a = paddle.to_tensor(lod_tensor) + self.assertTrue(np.array_equal(a_np, a.numpy())) + def test_to_variable(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array, name="abc") diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 690ac46e563ef..71051689dbc15 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -15,12 +15,15 @@ from __future__ import print_function import unittest +import paddle from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core import numpy as np +paddle.enable_static() + class TestVariable(unittest.TestCase): def test_np_dtype_convert(self): diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 844115d4acecc..24c463ebfc9a1 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -36,7 +36,8 @@ def YoloBox(x, img_size, attrs): clip_bbox = attrs['clip_bbox'] scale_x_y = attrs['scale_x_y'] bias_x_y = -0.5 * (scale_x_y - 1.) 
- input_size = downsample * h + input_h = downsample * h + input_w = downsample * w x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) @@ -50,7 +51,7 @@ def YoloBox(x, img_size, attrs): anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] anchors_s = np.array( - [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + [(an_w / input_w, an_h / input_h) for an_w, an_h in anchors]) anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1)) anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1)) pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w @@ -191,5 +192,19 @@ def test_static(self): assert boxes is not None and scores is not None +class TestYoloBoxOpHW(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = False + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 9) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1. + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py index 9bea33e484e19..d33cb2157b03b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py @@ -13,13 +13,18 @@ # limitations under the License. from __future__ import print_function +import unittest import sys sys.path.append("..") -import unittest + import numpy as np -from op_test import OpTest + import paddle import paddle.fluid as fluid +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() def gather_numpy(x, index, axis): @@ -29,37 +34,12 @@ def gather_numpy(x, index, axis): return gather -class TestGatherOp(OpTest): - def setUp(self): - self.op_type = "gather" - self.config() - xnp = np.random.random(self.x_shape).astype(self.x_type) - self.inputs = { - 'X': xnp, - 'Index': np.array(self.index).astype(self.index_type) - } - self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - def config(self): - """ - For multi-dimension input - """ - self.x_shape = (10, 20) - self.x_type = "float64" - self.index = [1, 3, 5] - self.index_type = "int32" - - -class TestXPUGatherOp(OpTest): +class TestXPUGatherOp(XPUOpTest): def setUp(self): + self.dtype = "float32" self.op_type = "gather" - self.dtype = np.float32 + self.use_xpu = True + self.use_mkldnn = False self.attrs = {'use_xpu': True} self.config() @@ -71,12 +51,12 @@ def setUp(self): self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} def test_check_output(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) self.check_output_with_place(place) def test_check_grad(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X'], 'Out') @@ -85,7 +65,7 @@ def config(self): For multi-dimension input """ self.x_shape = (10, 20) - self.x_type = self.dtype + self.x_type = "float32" self.index = [1, 3, 5] self.index_type = "int32" @@ -150,5 +130,14 @@ def config(self): self.index_type = "int32" +class 
TestCase7(TestXPUGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {'use_xpu': True, 'overwrite': True} + self.x_type = "float32" + self.index = [1, 3] + self.index_type = "int64" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py new file mode 100644 index 0000000000000..c4e1363bd9c94 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import sys +sys.path.append("..") +import numpy as np +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() + + +def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False): + if isinstance(axis, int): + axis = (axis, ) + elif isinstance(axis, list): + axis = tuple(axis) + if reduce_all: + axis = None + out = np.log(np.exp(x).sum(axis=axis, keepdims=keepdim)) + return out + + +class XPUTestLogsumexp(XPUOpTest): + def setUp(self): + self.op_type = 'logsumexp' + self.shape = [2, 3, 4, 5] + self.dtype = 'float32' + self.axis = [-1] + self.keepdim = False + self.reduce_all = False + self.set_attrs() + + np.random.seed(10) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = ref_logsumexp(x, self.axis, self.keepdim, self.reduce_all) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = { + 'axis': self.axis, + 'keepdim': self.keepdim, + 'reduce_all': self.reduce_all + } + + def set_attrs(self): + pass + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + pass + + +class TestLogsumexp_shape(XPUTestLogsumexp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +class TestLogsumexp_axis(XPUTestLogsumexp): + def set_attrs(self): + self.axis = [0, -1] + + +class TestLogsumexp_axis_all(XPUTestLogsumexp): + def set_attrs(self): + self.axis = [0, 1, 2, 3] + + +class TestLogsumexp_keepdim(XPUTestLogsumexp): + def set_attrs(self): + self.keepdim = True + + +class TestLogsumexp_reduce_all(XPUTestLogsumexp): + def set_attrs(self): + self.reduce_all = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py new file mode 100644 index 0000000000000..242b5b14db2bc --- /dev/null +++ b/python/paddle/fluid/variable_index.py @@ -0,0 +1,306 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +from . import unique_name +from . import core + +MAX_INTEGER = 2**31 - 1 + + +def replace_ellipsis(var, item): + from .framework import Variable + # Use slice(None) to replace Ellipsis. + # For var, var.shape = [3,4,5,6] + # + # var[..., 1:2] -> var[:, :, :, 1:2] + # var[0, ...] -> var[0] + # var[0, ..., 1:2] -> var[0, :, :, 1:2] + + item = list(item) + + # Remove Variable to skip bug when counting Ellipsis + item_remove_var = [ele for ele in item if not isinstance(ele, Variable)] + ell_count = item_remove_var.count(Ellipsis) + if ell_count == 0: + return item + elif ell_count > 1: + raise IndexError("An index can only have a single ellipsis ('...')") + + ell_idx = item.index(Ellipsis) + + if ell_idx == len(item) - 1: + return item[:-1] + else: + item[ell_idx:ell_idx + 1] = [slice(None)] * ( + len(var.shape) - len(item) + 1) + + return item + + +def is_integer_or_scalar_tensor(ele): + from .framework import Variable + if isinstance(ele, int): + return True + elif isinstance(ele, Variable): + if len(ele.shape) == 1 and ele.shape[0] == 1: + return True + return False + + +def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): + from .framework import Variable + from .layers import utils + + if utils._contain_var(attr): + inputs[tensor_attr_name] = utils._convert_to_tensor_list( + attr, dtype="int64") + for i, dim in enumerate(attr): + if isinstance(dim, Variable): + attrs[attr_name].append(-1) + infer_flags[i] = -1 + else: + attrs[attr_name].append(dim) + else: + attrs[attr_name] = attr + + +def _getitem_impl_(var, item): + """ + Slice the variable. + + Args: + item(int/slice/tuple) : the index. + + Returns: + Sliced variable + """ + from .framework import default_main_program + + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + reverse_axis = [] + + use_strided_slice = False + + for dim, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + decrease_axes.append(dim) + start = slice_item + step = 1 + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + continue + + step = 1 if step is None else step + + if start is None and end is None: + assert (step == -1) + reverse_axis.append(dim) + continue + + start = 0 if start is None else start + end = MAX_INTEGER if end is None else end + + else: + raise IndexError( + "Valid index accept int or slice or ellipsis, but received {}.". 
+ format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + use_strided_slice = True if step != 1 else use_strided_slice + + inputs = {'Input': [var]} + attrs = { + 'axes': axes, + 'starts': [], + 'ends': [], + 'decrease_axis': decrease_axes + } + if use_strided_slice: + attrs['strides'] = [] + + infer_flags = [1] * len(axes) + deal_attrs(attrs, starts, "starts", "StartsTensorList", inputs, infer_flags) + deal_attrs(attrs, ends, "ends", "EndsTensorList", inputs, infer_flags) + deal_attrs(attrs, steps, "strides", "StridesTensorList", inputs, + infer_flags) + attrs['infer_flags'] = infer_flags + + out = var + if len(axes) > 0: + target_block = default_main_program().current_block() + op_type = "strided_slice" if use_strided_slice else "slice" + + slice_out_var = target_block.create_var( + name=unique_name.generate_with_ignorable_key(var.name + "_" + + op_type), + dtype=var.dtype) + target_block.append_op( + type=op_type, + inputs=inputs, + outputs={'Out': [slice_out_var]}, + attrs=attrs) + out = slice_out_var + + if len(reverse_axis) > 0: + from .layers.tensor import reverse + out = reverse(out, axis=reverse_axis) + + return out + + +def _setitem_impl_(var, item, value): + from .framework import default_main_program, Variable + + inputs = {'Input': var} + + # 1. Parse item + if not isinstance(item, tuple): + item = (item, ) + + decrease_axes = [] + axes = [] + starts = [] + ends = [] + steps = [] + + item = replace_ellipsis(var, item) + + for dim, slice_item in enumerate(item): + if is_integer_or_scalar_tensor(slice_item): + decrease_axes.append(dim) + start = slice_item + end = slice_item + 1 if slice_item != -1 else MAX_INTEGER + step = 1 + + elif isinstance(slice_item, slice): + start = slice_item.start + end = slice_item.stop + step = slice_item.step + + if start is None and end is None and step is None: + continue + + step = 1 if step is None else step + + if not isinstance(step, Variable) and step == 0: + raise ValueError( + "When assign a value to a paddle.Tensor, step can not be 0, " + "but received step is {}.".format(step)) + + if isinstance(step, Variable) and (start is None or end is None): + raise ValueError( + "When assign a value to a paddle.Tensor, it's not supported that " + "the start or end is None when the type of step is paddle.Tensor." + ) + + if start is None: + start = 0 if step > 0 else MAX_INTEGER + + if end is None: + end = MAX_INTEGER if step > 0 else (0 - MAX_INTEGER) + else: + raise IndexError( + "Valid index accept int or slice or ellipsis, but received {}.". + format(slice_item)) + + axes.append(dim) + starts.append(start) + ends.append(end) + steps.append(step) + + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes + } + + from .layers import utils + if utils._contain_var(starts): + inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) + del attrs['starts'] + if utils._contain_var(ends): + inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) + del attrs['ends'] + if utils._contain_var(steps): + inputs['StepsTensorList'] = utils._convert_to_tensor_list(steps) + del attrs['steps'] + + # 2. 
Parse value + dtype = var.dtype + attrs['dtype'] = dtype + + from .data_feeder import convert_dtype + # 2.1 value is an integer of float + if isinstance(value, (int, float)): + value = np.array([value]).astype(convert_dtype(dtype)) + + # 2.2 value is a np.ndarray + if isinstance(value, np.ndarray): + shape = list(value.shape) + if dtype == core.VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [bool(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP64: + value_name = "fp64_values" + values = [float(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in value.flat] + else: + raise TypeError( + "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + attrs[value_name] = values + attrs["shape"] = shape + + elif isinstance(value, Variable): + inputs["ValueTensor"] = value + else: + raise TypeError( + "Only support to assign an integer, float, numpy.ndarray or " + "paddle.Tensor to a paddle.Tensor, but received {}".format( + type(value))) + + cur_block = default_main_program().current_block() + cur_block.append_op( + type="set_value", inputs=inputs, outputs={'Out': var}, attrs=attrs) + + return var diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 17eaa82cd8b6a..93056a60c371c 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -96,11 +96,12 @@ def set_grad_enabled(mode): Examples: .. code-block:: python + import paddle x = paddle.ones([3, 2]) x.stop_gradient = False - with torch.set_grad_enabled(False): + with paddle.set_grad_enabled(False): y = x * 2 - with torch.set_grad_enabled(True): + with paddle.set_grad_enabled(True): z = x * 2 print(y.stop_gradient) # True print(z.stop_gradient) # False diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 493574c5bef47..1705db50d391a 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -491,12 +491,12 @@ def _save_binary_var(obj, path): format(type(obj))) -def save(obj, path, protocol=2, **configs): +def save(obj, path, protocol=4, **configs): ''' Save an object to the specified path. .. note:: - Now supports saving ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor. + Now supports saving ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor, Program. .. note:: Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, @@ -512,7 +512,7 @@ def save(obj, path, protocol=2, **configs): path(str) : The path of the object to be saved. If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. - Default: 2 + Default: 4 **configs(dict, optional): optional keyword arguments. The following options are currently supported: use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. 
If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. @@ -544,7 +544,18 @@ def save(obj, path, protocol=2, **configs): # save weight of emb paddle.save(emb.weight, "emb.weight.pdtensor") - # example 2: static graph + # example 2: Save multiple state_dict at the same time + from paddle import nn + from paddle.optimizer import Adam + + layer = paddle.nn.Linear(3, 4) + adam = Adam(learning_rate=0.001, parameters=layer.parameters()) + obj = {'model': layer.state_dict(), 'opt': adam.state_dict(), 'epoch': 100} + path = 'example/model.pdparams' + paddle.save(obj, path) + + + # example 3: static graph import paddle import paddle.static as static @@ -570,6 +581,18 @@ def save(obj, path, protocol=2, **configs): # save/load state_dict path_state_dict = 'temp/model.pdparams' paddle.save(prog.state_dict("param"), path_tensor) + + # example 4: save program + import paddle + + paddle.enable_static() + + data = paddle.static.data( + name='x_static_save', shape=(None, 224), dtype='float32') + y_static = z = paddle.static.nn.fc(data, 10) + main_program = paddle.static.default_main_program() + path = "example/main_program.pdmodel" + paddle.save(main_program, path) ''' # 1. input check filename = os.path.basename(path) @@ -667,7 +690,7 @@ def load(path, **configs): Load an object can be used in paddle from specified path. .. note:: - Now supports loading ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor. + Now supports loading ``state_dict`` of Layer/Optimizer, Layer, Tensor and nested structure containing Tensor, Program. .. note:: In order to use the model parameters saved by paddle more efficiently, @@ -714,8 +737,6 @@ def load(path, **configs): Examples: .. 
code-block:: python - import paddle - # example 1: dynamic graph import paddle emb = paddle.nn.Embedding(10, 10) @@ -744,7 +765,19 @@ def load(path, **configs): load_weight = paddle.load("emb.weight.pdtensor") - # example 2: static graph + # example 2: Load multiple state_dict at the same time + from paddle import nn + from paddle.optimizer import Adam + + layer = paddle.nn.Linear(3, 4) + adam = Adam(learning_rate=0.001, parameters=layer.parameters()) + obj = {'model': layer.state_dict(), 'opt': adam.state_dict(), 'epoch': 100} + path = 'example/model.pdparams' + paddle.save(obj, path) + obj_load = paddle.load(path) + + + # example 3: static graph import paddle import paddle.static as static @@ -773,6 +806,22 @@ def load(path, **configs): paddle.save(prog.state_dict("param"), path_tensor) load_state_dict = paddle.load(path_tensor) + + # example 4: load program + import paddle + + paddle.enable_static() + + data = paddle.static.data( + name='x_static_save', shape=(None, 224), dtype='float32') + y_static = z = paddle.static.nn.fc(data, 10) + main_program = paddle.static.default_main_program() + path = "example/main_program.pdmodel" + paddle.save(main_program, path) + load_main = paddle.load(path) + print(load_main) + + ''' if os.path.isfile(path): diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index cd4b35ea29a83..61ae8b42d63a9 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -25,10 +25,7 @@ from .progressbar import ProgressBar -__all__ = [ - 'Callback', 'ProgBarLogger', 'ModelCheckpoint', 'VisualDL', 'LRScheduler', - 'EarlyStopping', 'ReduceLROnPlateau' -] +__all__ = [] def config_callbacks(callbacks=None, diff --git a/python/paddle/hub.py b/python/paddle/hub.py new file mode 100644 index 0000000000000..acdb28cb6f08d --- /dev/null +++ b/python/paddle/hub.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
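The new ``python/paddle/hub.py`` module, whose body follows, only re-exports ``list``, ``help`` and ``load`` from ``paddle.hapi.hub``. A minimal introspection sketch; the commented-out call is hypothetical usage, and the repository name and keyword argument are placeholders rather than anything taken from this patch:

    import paddle

    # The three entry points exposed by the new paddle.hub module.
    print(paddle.hub.__all__)   # ['list', 'help', 'load']

    # The implementations live in paddle.hapi.hub; inspect the real signatures
    # there instead of guessing them here.
    help(paddle.hub.load)

    # Hypothetical usage, assuming a torch.hub-style interface:
    # models = paddle.hub.list('SomeOrg/SomeRepo', source='github')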
+ +from .hapi.hub import list # noqa: F401 +from .hapi.hub import help # noqa: F401 +from .hapi.hub import load # noqa: F401 + +__all__ = [ #noqa + 'list', 'help', 'load' +] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index c4f4b6cbc1f7c..7cf3f94872de1 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -232,10 +232,8 @@ def weight_norm(*args): 'MaxPool3D', 'AdaptiveMaxPool2D', 'Hardshrink', - 'clip', 'Softplus', 'KLDivLoss', - 'clip_by_norm', 'AvgPool2D', 'L1Loss', 'LeakyReLU', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index aa0bd8a8c5e3d..eeb0062587646 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1411,6 +1411,13 @@ def cross_entropy(input, out = core.ops.elementwise_mul(out, weight_gather_reshape) else: + label_min = paddle.min(label) + label_max = paddle.max(label) + if label_min < 0 or label_max >= input.shape[-1]: + raise ValueError( + 'Expected 0 <= label_value < class_dimension({}), but got {} <= label_value <= {} '. + format(input.shape[-1], + label_min.numpy(), label_max.numpy())) weight_gather = core.ops.gather_nd(weight, label) input_shape = list(label.shape) weight_gather_reshape = reshape( diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 2de065d62a4f8..eecea3034a752 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -199,7 +199,7 @@ class Conv1D(_ConvNd): * :math:`X`: Input value, a ``Tensor`` with 'NCL' format or 'NLC' format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCK] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -259,11 +259,15 @@ class Conv1D(_ConvNd): is not set, the bias is initialized zero. Default: None. Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. Shape: - x: 3-D tensor with shape: (batch, in_channels, length) or (batch, length, in_channels). + - weight: 3-D tensor with shape: (out_channels, in_channels, kernel_size) + - bias: 1-D tensor with shape: (out_channels) - output: 3-D tensor with same shape as input x. Raises: @@ -444,6 +448,8 @@ class Conv1DTranspose(_ConvNd): Shape: - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is "NCL" or shape (batch, length, in_channels) when data_format is "NLC". + - weight(Tensor): 3-D tensor with shape (in_channels, out_channels, kernel_length). + - bias(Tensor): 1-D tensor with shape (out_channels). - output_size(int|tuple|list, optional): The output image size. If output size is a tuple/list, it must contain one integer, (feature_length). None if use kernel_size, padding, output_padding and stride to calculate output_size. If output_size and kernel_size are specified at the same time, They should follow the formula above. Default: None. output_size and kernel_size should not be None at the same time. - output(Tensor): 3-D tensor with same shape as input x. @@ -540,7 +546,7 @@ class Conv2D(_ConvNd): * :math:`X`: Input value, a ``Tensor`` with NCHW format. * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. 
+ * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -590,6 +596,10 @@ class Conv2D(_ConvNd): - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{out}, C_{in}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -676,15 +686,15 @@ class Conv2DTranspose(_ConvNd): filter, and dilations, strides, paddings. Input and output are in NCHW format. Where N is batch size, C is the number of feature map, H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of input feature map, - C is the number of output feature map, H is the height of the filter, + Filter's shape is [CMHW] , where C is the number of input feature map, + M is the number of output feature map, H is the height of the filter, and W is the width of the filter. If the groups is greater than 1, C will equal the number of input feature map divided by the groups. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. The details of convolution transpose layer, please refer to the following explanation and references - `conv2dtranspose `_ . + `conv2dtranspose `_ . For each input :math:`X`, the equation is: .. math:: @@ -694,9 +704,9 @@ class Conv2DTranspose(_ConvNd): Where: * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`W`: Filter value, a ``Tensor`` with shape [CMHW] . * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -749,6 +759,10 @@ class Conv2DTranspose(_ConvNd): - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{in}, C_{out}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -851,7 +865,7 @@ class Conv3D(_ConvNd): * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. * :math:`W`: Filter value, a tensor with MCDHW format. * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -901,6 +915,10 @@ class Conv3D(_ConvNd): - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{out}, C_{in}, K_{d}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` Where @@ -995,7 +1013,7 @@ class Conv3DTranspose(_ConvNd): is the width of the feature. Parameters(dilations, strides, paddings) are two elements. These two elements represent height and width, respectively. The details of convolution transpose layer, please refer to the following - explanation and references `therein `_. + explanation and references `therein `_. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. 
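The corrected docstrings above describe the transposed-convolution filter as ``(C_in, C_out, K_h, K_w)`` and the bias as a 1-D tensor. A quick way to confirm the layout from Python; a minimal sketch assuming a recent Paddle wheel, with the shapes in the comments following the corrected docstrings:

    import paddle

    # Filter layout is (in_channels, out_channels, K_h, K_w) for Conv2DTranspose,
    # and the bias is 1-D with out_channels entries, matching the fixed docs.
    layer = paddle.nn.Conv2DTranspose(in_channels=4, out_channels=6, kernel_size=3)
    print(layer.weight.shape)   # [4, 6, 3, 3]
    print(layer.bias.shape)     # [6]

    x = paddle.randn([1, 4, 8, 8])
    print(layer(x).shape)       # [1, 6, 10, 10]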
@@ -1008,9 +1026,9 @@ class Conv3DTranspose(_ConvNd): In the above equation: * :math:`X`: Input value, a tensor with NCDHW format. - * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`W`: Filter value, a tensor with CMDHW format. * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. @@ -1077,6 +1095,10 @@ class Conv3DTranspose(_ConvNd): - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + - weight: :math:`(C_{in}, C_{out}, K_{d}, K_{h}, K_{w})` + + - bias: :math:`(C_{out})` + - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` Where diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 5ce9e0937d33d..250eb235fd7d4 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -143,14 +143,14 @@ def spectral_norm(layer, and W is the product result of remaining dimensions. Step 2: - :attr:`power_iters` should be a positive integer, do following + :attr:`n_power_iterations` should be a positive integer, do following calculations with U and V for :attr:`power_iters` rounds. .. math:: - \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} + \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - \mathbf{u} := \\frac{\mathbf{W} \mathbf{v}}{\|\mathbf{W} \mathbf{v}\|_2} + \mathbf{u} := \frac{\mathbf{W} \mathbf{v}}{\|\mathbf{W} \mathbf{v}\|_2} Step 3: Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. @@ -159,7 +159,7 @@ def spectral_norm(layer, \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})} + \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})} Refer to `Spectral Normalization `_ . diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 3129029d82920..da9749722e132 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -17,6 +17,7 @@ import multiprocessing import six import sys +import warnings from six.moves.queue import Queue from six.moves import zip_longest @@ -25,7 +26,9 @@ import itertools import random import zlib + import paddle.compat as cpt +from paddle.fluid.reader import QUEUE_GET_TIMEOUT __all__ = [] @@ -584,10 +587,13 @@ def _impl(): raise NotImplementedError( "The multiprocess_reader method is not supported on windows.") + # ujson is ultra fast json encoder and decoder written in pure C with bindings for Python 3.6+. try: import ujson as json except Exception as e: - sys.stderr.write("import ujson error: " + str(e) + " use json\n") + warnings.warn( + "The `ujson` module is not found, use the `json` module, `ujson` encodes and decodes faster, " + "you can install `ujson` through `pip install ujson`.") import json assert isinstance(readers, (list, tuple)) and len(readers) > 0, ( @@ -614,11 +620,20 @@ def queue_reader(): reader_num = len(readers) finish_num = 0 while finish_num < reader_num: - sample = queue.get() + try: + sample = queue.get(timeout=QUEUE_GET_TIMEOUT) + except: + logging.error( + "multiprocess_reader failed to get data from the multiprocessing.Queue." 
+ ) + six.reraise(*sys.exc_info()) + if sample is None: finish_num += 1 elif sample == "": - raise ValueError("multiprocess reader raises an exception") + raise ValueError( + "multiprocess_reader failed to put data into the multiprocessing.Queue." + ) else: yield sample @@ -660,7 +675,9 @@ def pipe_reader(): elif sample == "": conn.close() conn_to_remove.append(conn) - raise ValueError("multiprocess reader raises an exception") + raise ValueError( + "multiprocess_reader failed to send data into the multiprocessing.Pipe." + ) else: yield sample diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 361c0e80f90d7..e1012e7656a3d 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -118,6 +118,16 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): place = _current_expected_place() if not isinstance(data, np.ndarray): + + def _handle_diff_place_dtype(data, dtype, place, stop_gradient): + data.stop_gradient = stop_gradient + if not data.place._equals(place): + data = data._copy_to(place, False) + if dtype: + if convert_dtype(dtype) != convert_dtype(data.dtype): + return data.astype(convert_dtype(dtype)) + return data + if np.isscalar(data) and not isinstance(data, str): data = np.array([data]) elif isinstance(data, (list, tuple)): @@ -128,13 +138,11 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "this means the input data contains nested lists with different lengths. " ) elif isinstance(data, paddle.Tensor): - data.stop_gradient = stop_gradient - if not data.place._equals(place): - data = data._copy_to(place, False) - if dtype: - if convert_dtype(dtype) != convert_dtype(data.dtype): - return data.astype(convert_dtype(dtype)) - return data + return _handle_diff_place_dtype(data, dtype, place, stop_gradient) + elif isinstance(data, (core.Tensor, core.LoDTensor)): + # convert LoDTensor to VarBase first, and then process it as input VarBase + data = paddle.Tensor(data) + return _handle_diff_place_dtype(data, dtype, place, stop_gradient) else: raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 97826f7d5f81d..67e6c7f8e44d7 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -80,7 +80,7 @@ def concat(x, axis=0, name=None): Args: x(list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, - float32, float64, int32, int64. All the Tensors in ``x`` must have same data type. + float32, float64, int32, int64, uint8. All the Tensors in ``x`` must have same data type. axis(int|Tensor, optional): Specify the axis to operate on the input Tensors. It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). 
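With the ``creation.py`` change above, ``paddle.to_tensor`` also accepts a ``core.Tensor``/``core.LoDTensor`` by converting it to a ``paddle.Tensor`` first and then applying the usual place/dtype handling. A minimal sketch of the new behaviour, mirroring the ``test_to_tensor_with_lodtensor`` case added earlier in this patch (dynamic-graph mode, CPU):

    import numpy as np
    import paddle
    import paddle.fluid.core as core

    a_np = np.random.rand(4, 4).astype('float32')

    # Build a LoDTensor by hand; before this change to_tensor only accepted
    # scalars, lists/tuples, numpy arrays and paddle.Tensor.
    lod_tensor = core.LoDTensor()
    lod_tensor.set(a_np, core.CPUPlace())

    t = paddle.to_tensor(lod_tensor)
    assert np.array_equal(t.numpy(), a_np)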
When ``axis < 0``, diff --git a/python/paddle/tests/test_download.py b/python/paddle/tests/test_download.py index b8af7f6a80e72..4be2dde1bccb1 100644 --- a/python/paddle/tests/test_download.py +++ b/python/paddle/tests/test_download.py @@ -70,6 +70,13 @@ def test_get_path_from_url(self): for url in urls: get_path_from_url(url, root_dir='./test') + def test_retry_exception(self, ): + with self.assertRaises(RuntimeError): + from paddle.utils.download import _download + _download( + 'www.baidu.com', + './test', ) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index c84950fdbc539..974943a99d8b4 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -525,10 +525,10 @@ def test_errors(self): image_load('tmp.jpg', backend=1) def test_normalize(self): - np_img = (np.random.rand(28, 24, 3)).astype('uint8') + np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img) tensor_img = F.to_tensor(pil_img) - tensor_img_hwc = F.to_tensor(pil_img, data_format='HWC') + tensor_img_hwc = F.to_tensor(pil_img, data_format='HWC') * 255 mean = [0.5, 0.5, 0.5] std = [0.5, 0.5, 0.5] @@ -539,17 +539,17 @@ def test_normalize(self): normalized_img_pil = F.normalize(pil_img, mean, std, data_format='HWC') normalized_img_np = F.normalize( - np_img, mean, std, data_format='HWC', to_rgb=True) + np_img, mean, std, data_format='HWC', to_rgb=False) np.testing.assert_almost_equal( np.array(normalized_img_pil), normalized_img_np) - np.testing.assert_almost_equal(normalized_img_tensor.numpy(), - normalized_img_np) + np.testing.assert_almost_equal( + normalized_img_tensor.numpy(), normalized_img_np, decimal=4) def test_center_crop(self): - np_img = (np.random.rand(28, 24, 3)).astype('uint8') + np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img) - tensor_img = F.to_tensor(pil_img, data_format='CHW') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 np_cropped_img = F.center_crop(np_img, 4) pil_cropped_img = F.center_crop(pil_img, 4) @@ -557,23 +557,25 @@ def test_center_crop(self): np.testing.assert_almost_equal(np_cropped_img, np.array(pil_cropped_img)) - np.testing.assert_almost_equal(np_cropped_img, - tensor_cropped_img.numpy().transpose( - (1, 2, 0))) + np.testing.assert_almost_equal( + np_cropped_img, + tensor_cropped_img.numpy().transpose((1, 2, 0)), + decimal=4) def test_pad(self): - np_img = (np.random.rand(28, 24, 3)).astype('uint8') + np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img) - tensor_img = F.to_tensor(pil_img, 'CHW') + tensor_img = F.to_tensor(pil_img, 'CHW') * 255 np_padded_img = F.pad(np_img, [1, 2], padding_mode='reflect') pil_padded_img = F.pad(pil_img, [1, 2], padding_mode='reflect') tensor_padded_img = F.pad(tensor_img, [1, 2], padding_mode='reflect') np.testing.assert_almost_equal(np_padded_img, np.array(pil_padded_img)) - np.testing.assert_almost_equal(np_padded_img, - tensor_padded_img.numpy().transpose( - (1, 2, 0))) + np.testing.assert_almost_equal( + np_padded_img, + tensor_padded_img.numpy().transpose((1, 2, 0)), + decimal=3) tensor_padded_img = F.pad(tensor_img, 1, padding_mode='reflect') tensor_padded_img = F.pad(tensor_img, [1, 2, 1, 2], @@ -584,9 +586,9 @@ def test_pad(self): pil_padded_img = F.pad(pil_p_img, [1, 2], padding_mode='reflect') def test_resize(self): - np_img = (np.zeros([28, 24, 3])).astype('uint8') + np_img = 
(np.zeros([28, 24, 3]) * 255).astype('uint8') pil_img = Image.fromarray(np_img) - tensor_img = F.to_tensor(pil_img, 'CHW') + tensor_img = F.to_tensor(pil_img, 'CHW') * 255 np_reseized_img = F.resize(np_img, 40) pil_reseized_img = F.resize(pil_img, 40) @@ -595,12 +597,14 @@ def test_resize(self): np.testing.assert_almost_equal(np_reseized_img, np.array(pil_reseized_img)) - np.testing.assert_almost_equal(np_reseized_img, - tensor_reseized_img.numpy().transpose( - (1, 2, 0))) - np.testing.assert_almost_equal(np_reseized_img, - tensor_reseized_img2.numpy().transpose( - (1, 2, 0))) + np.testing.assert_almost_equal( + np_reseized_img, + tensor_reseized_img.numpy().transpose((1, 2, 0)), + decimal=3) + np.testing.assert_almost_equal( + np_reseized_img, + tensor_reseized_img2.numpy().transpose((1, 2, 0)), + decimal=3) gray_img = (np.zeros([28, 32])).astype('uint8') gray_resize_img = F.resize(gray_img, 40) diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 40c9d415e11f1..c23841ea8b802 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -19,18 +19,13 @@ from .lazy_import import try_import # noqa: F401 from .op_version import OpLastCheckpointChecker # noqa: F401 from .install_check import run_check # noqa: F401 -from ..fluid.framework import unique_name # noqa: F401 +from . import unique_name # noqa: F401 from ..fluid.framework import require_version # noqa: F401 from . import download # noqa: F401 from . import image_util # noqa: F401 from . import cpp_extension # noqa: F401 -__all__ = [ #noqa - 'deprecated', - 'download', - 'run_check', - 'unique_name', - 'require_version', - 'try_import' +__all__ = [ #noqa + 'deprecated', 'run_check', 'require_version', 'try_import' ] diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index ea46ea8b39195..104d979ef6785 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -469,10 +469,6 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): ########################### -- END -- ########################### add_compile_flag(extra_compile_args, ['-w']) # disable warning - # Note(Aurelius84): This marco will impact memory layout of `Tensor`. - # We align it automatically with pre-installed Paddle. - if core.is_compiled_with_mkldnn(): - add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_MKLDNN']) if use_cuda: extra_link_args.append('-lcudart') diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index ddd1dad9dbdf5..3ad627ddea927 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -55,7 +55,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): import logging logger = logging.getLogger(__name__) -__all__ = [] +__all__ = ['get_weights_path_from_url'] WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") @@ -186,7 +186,15 @@ def _download(url, path, md5sum=None): logger.info("Downloading {} from {}".format(fname, url)) - req = requests.get(url, stream=True) + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". 
+ format(fname, url, retry_cnt + 1, str(e))) + time.sleep(1) + continue + if req.status_code != 200: raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) diff --git a/python/paddle/utils/unique_name.py b/python/paddle/utils/unique_name.py new file mode 100644 index 0000000000000..d0d487c933d76 --- /dev/null +++ b/python/paddle/utils/unique_name.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..fluid.unique_name import generate # noqa: F401 +from ..fluid.unique_name import switch # noqa: F401 +from ..fluid.unique_name import guard # noqa: F401 + +__all__ = [ #noqa + 'generate', 'switch', 'guard' +] diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py index 448d6efb52bec..65c0b604efd5d 100644 --- a/python/paddle/vision/datasets/flowers.py +++ b/python/paddle/vision/datasets/flowers.py @@ -93,62 +93,44 @@ def __init__(self, .format(backend)) self.backend = backend - self.flag = MODE_FLAG_MAP[mode.lower()] + flag = MODE_FLAG_MAP[mode.lower()] - self.data_file = data_file - if self.data_file is None: + if not data_file: assert download, "data_file is not set and downloading automatically is disabled" - self.data_file = _check_exists_and_download( + data_file = _check_exists_and_download( data_file, DATA_URL, DATA_MD5, 'flowers', download) - self.label_file = label_file - if self.label_file is None: + if not label_file: assert download, "label_file is not set and downloading automatically is disabled" - self.label_file = _check_exists_and_download( + label_file = _check_exists_and_download( label_file, LABEL_URL, LABEL_MD5, 'flowers', download) - self.setid_file = setid_file - if self.setid_file is None: + if not setid_file: assert download, "setid_file is not set and downloading automatically is disabled" - self.setid_file = _check_exists_and_download( + setid_file = _check_exists_and_download( setid_file, SETID_URL, SETID_MD5, 'flowers', download) self.transform = transform - # read dataset into memory - self._load_anno() - - self.dtype = paddle.get_default_dtype() - - def _load_anno(self): - self.name2mem = {} - self.data_tar = tarfile.open(self.data_file) - for ele in self.data_tar.getmembers(): - self.name2mem[ele.name] = ele + data_tar = tarfile.open(data_file) + self.data_path = data_file.replace(".tgz", "/") + if not os.path.exists(self.data_path): + os.mkdir(self.data_path) + data_tar.extractall(self.data_path) scio = try_import('scipy.io') - - # double check data download - self.label_file = _check_exists_and_download(self.label_file, LABEL_URL, - LABEL_MD5, 'flowers', True) - - self.setid_file = _check_exists_and_download(self.setid_file, SETID_URL, - SETID_MD5, 'flowers', True) - - self.labels = scio.loadmat(self.label_file)['labels'][0] - self.indexes = scio.loadmat(self.setid_file)[self.flag][0] + self.labels = scio.loadmat(label_file)['labels'][0] + self.indexes = scio.loadmat(setid_file)[flag][0] def 
__getitem__(self, idx): index = self.indexes[idx] label = np.array([self.labels[index - 1]]) img_name = "jpg/image_%05d.jpg" % index - img_ele = self.name2mem[img_name] - image = self.data_tar.extractfile(img_ele).read() - + image = os.path.join(self.data_path, img_name) if self.backend == 'pil': - image = Image.open(io.BytesIO(image)) + image = Image.open(image) elif self.backend == 'cv2': - image = np.array(Image.open(io.BytesIO(image))) + image = np.array(Image.open(image)) if self.transform is not None: image = self.transform(image) @@ -156,7 +138,7 @@ def __getitem__(self, idx): if self.backend == 'pil': return image, label.astype('int64') - return image.astype(self.dtype), label.astype('int64') + return image.astype(paddle.get_default_dtype()), label.astype('int64') def __len__(self): return len(self.indexes) diff --git a/scripts/paddle b/scripts/paddle deleted file mode 100644 index 5f256ccf15791..0000000000000 --- a/scripts/paddle +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash - -function version(){ - echo "PaddlePaddle , compiled with" - echo " with_avx: ON" - echo " with_gpu: OFF" - echo " with_mkl: ON" - echo " with_mkldnn: " - echo " with_python: ON" -} - -function ver2num() { - set -e - # convert version to number. - if [ -z "$1" ]; then # empty argument - printf "%03d%03d%03d%03d%03d" 0 - else - local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \ - | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g') - if [ `echo $VERN | wc -w` -eq 3 ] ; then - printf "%03d%03d%03d%03d%03d" $VERN 999 999 - else - printf "%03d%03d%03d%03d%03d" $VERN - fi - fi - set +e -} - -function cpu_config() { - # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status - # only when MKL enabled - if [ "ON" == "OFF" ]; then - return 0 - fi - platform="`uname -s`" - ht=0 - if [ $platform == "Linux" ]; then - ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` - elif [ $platform == "Darwin" ]; then - if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then - # HT is OFF - ht=1 - fi - else - return 0 - fi - if [ $ht -eq 1 ]; then # HT is OFF - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,0,0" - fi - if [ -z "$OMP_DYNAMIC" ]; then - export OMP_DYNAMIC="FALSE" - fi - else # HT is ON - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,1,0" - fi - if [ -z "$OMP_DYNAMIC" ]; then - export OMP_DYNAMIC="True" - fi - fi -} - -function threads_config() { - # auto set OMP_NUM_THREADS and MKL_NUM_THREADS - # according to trainer_count and total processors - # only when MKL enabled - # auto set OPENBLAS_NUM_THREADS when do not use MKL - platform="`uname -s`" - processors=0 - if [ $platform == "Linux" ]; then - processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` - elif [ $platform == "Darwin" ]; then - processors=`sysctl -n hw.logicalcpu` - else - return 0 - fi - trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` - if [ -z $trainers ]; then - trainers=1 - fi - threads=$((processors / trainers)) - if [ $threads -eq 0 ]; then - threads=1 - fi - if [ "ON" == "ON" ]; then - if [ -z "$OMP_NUM_THREADS" ]; then - export OMP_NUM_THREADS=$threads - fi - if [ -z "$MKL_NUM_THREADS" ]; then - export MKL_NUM_THREADS=$threads - fi - else - if [ -z "$OPENBLAS_NUM_THREADS" ]; then - export OPENBLAS_NUM_THREADS=$threads - fi - if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then - export OPENBLAS_MAIN_FREE=1 - fi - fi - -} - -PADDLE_CONF_HOME="$HOME/.config/paddle" -mkdir -p ${PADDLE_CONF_HOME} - -if [ -z 
"${PADDLE_NO_STAT+x}" ]; then - SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"\" }"\ - -b ${PADDLE_CONF_HOME}/paddle.cookie \ - -c ${PADDLE_CONF_HOME}/paddle.cookie \ - http://api.paddlepaddle.org/version 2>/dev/null` - if [ $? -eq 0 ] && [ "$(ver2num )" -lt $(ver2num $SERVER_VER) ]; then - echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org" - fi -fi - -PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ ! -z "${DEBUGGER}" ]; then - echo "Using debug command ${DEBUGGER}" -fi - -CUDNN_LIB_PATH="" - -if [ ! -z "${CUDNN_LIB_PATH}" ]; then - export LD_LIBRARY_PATH=${CUDNN_LIB_PATH}:${LD_LIBRARY_PATH} -fi - -export PYTHONPATH=${PWD}:${PYTHONPATH} - - -# Check python lib installed or not. -pip --help > /dev/null -if [ $? -ne 0 ]; then - echo "pip should be installed to run paddle." - exit 1 -fi - -if [ "OFF" == "ON" ]; then - PADDLE_NAME="paddlepaddle-gpu" -else - PADDLE_NAME="paddlepaddle" -fi - -INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'` - -if [ -z "${INSTALLED_VERSION}" ]; then - INSTALLED_VERSION="0.0.0" # not installed -fi -cat <> %s' % + (clazz_filename, ut_map_file)) + break + else: + error_files.append(clazz_filename) + break + print("============len(pyCov_file)") + print(len(pyCov_file)) + print("============error") + print(error_files) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + ut = sys.argv[2] + analysisPyXml(rootPath, ut) diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 7301e9954e910..7457bcb268537 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -35,8 +35,8 @@ elif [[ "$SYSTEM" == "Windows_NT" ]];then git remote | grep upstream if [ $? != 0 ]; then git remote add upstream https://github.com/PaddlePaddle/Paddle.git - git fetch upstream develop fi + git fetch upstream ${BRANCH} fi CURBRANCH=`git rev-parse --abbrev-ref HEAD` echo $CURBRANCH diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b1395c28878e3..ef9af288fb0a2 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -52,7 +52,7 @@ API_FILES=("CMakeLists.txt" "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" - "tools/wlist.json" + "tools/print_signatures.py" "tools/sampcd_processor.py" "paddle/scripts/paddle_build.bat" "tools/windows/run_unittests.sh" @@ -80,11 +80,10 @@ function add_failed(){ echo_list="${echo_list[@]}$1" } -function run_test_sampcd_processor() { +function run_tools_test() { CUR_PWD=$(pwd) cd ${PADDLE_ROOT}/tools - python test_sampcd_processor.py - python test_print_signatures.py + python $1 cd ${CUR_PWD} } @@ -141,12 +140,12 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. 
For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n" check_approval 1 39303645 6836917 43953930 - elif [ "${API_FILE}" == "tools/wlist.json" ];then - echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n" - check_approval 1 29231 elif [ "${API_FILE}" == "tools/sampcd_processor.py" ];then echo_line="test_sampcd_processor.py will be executed for changed sampcd_processor.py.\n" - run_test_sampcd_processor + run_tools_test test_sampcd_processor.py + elif [ "${API_FILE}" == "tools/print_signatures.py" ];then + echo_line="test_print_signatures.py will be executed for changed print_signatures.py.\n" + run_tools_test test_print_signatures.py elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then echo_line="You must have (fuyinno4 (Recommend), raindrops2sea) approval for ${API_FILE} changes" check_approval 1 35824027 38231817 diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py index 15e410401216c..78abb6f36c606 100644 --- a/tools/check_op_desc.py +++ b/tools/check_op_desc.py @@ -17,8 +17,6 @@ from paddle.utils import OpLastCheckpointChecker from paddle.fluid.core import OpUpdateType -SAME = 0 - INPUTS = "Inputs" OUTPUTS = "Outputs" ATTRS = "Attrs" @@ -71,7 +69,7 @@ def diff_vars(origin_vars, new_vars): vars_name_only_in_new = set(new_vars.keys()) - set(origin_vars.keys()) for var_name in common_vars_name: - if cmp(origin_vars.get(var_name), new_vars.get(var_name)) == SAME: + if origin_vars.get(var_name) == new_vars.get(var_name): continue else: error, var_error = True, True @@ -120,7 +118,7 @@ def diff_attr(ori_attrs, new_attrs): attrs_only_in_new = set(new_attrs.keys()) - set(ori_attrs.keys()) for attr_name in common_attrs: - if cmp(ori_attrs.get(attr_name), new_attrs.get(attr_name)) == SAME: + if ori_attrs.get(attr_name) == new_attrs.get(attr_name): continue else: error, attr_error = True, True @@ -184,7 +182,7 @@ def compare_op_desc(origin_op_desc, new_op_desc): new = json.loads(new_op_desc) desc_error_message = {} version_error_message = {} - if cmp(origin_op_desc, new_op_desc) == SAME: + if origin_op_desc == new_op_desc: return desc_error_message, version_error_message for op_type in origin: diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 001f380049f92..470242da34ddd 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -130,7 +130,10 @@ def get_pr_files(self): if not files: break for f in files: - file_list.append(PADDLE_ROOT + f.filename) + if f.status == 'removed': + file_list.append('removed') + else: + file_list.append(PADDLE_ROOT + f.filename) page += 1 return file_list @@ -228,6 +231,15 @@ def is_only_comment(self, f): print('PREC {} is only comment'.format(f)) return True + def get_all_count(self): + os.system( + "cd %s/build && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount" + % PADDLE_ROOT) + f = open("%s/build/testCount" % PADDLE_ROOT) + testCount = f.read() + f.close() + return int(testCount.strip()) + def get_pr_ut(self): """ Get unit tests in pull request. 
""" if self.full_case: @@ -236,77 +248,89 @@ def get_pr_ut(self): ut_list = [] file_ut_map = None ret = self.__urlretrieve( - 'https://sys-p0.bj.bcebos.com/prec/file_ut.json{}'.format( - self.suffix), 'file_ut.json{}'.format(self.suffix)) + 'https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json', + 'ut_file_map.json') if not ret: print('PREC download file_ut.json failed') exit(1) - with open('file_ut.json' + self.suffix) as jsonfile: + with open('ut_file_map.json') as jsonfile: file_ut_map = json.load(jsonfile) - for f in self.get_pr_files(): - current_system = platform.system() - if current_system == "Darwin" or current_system == "Windows": - f_judge = f.replace(PADDLE_ROOT, '/paddle/', 1) - f_judge = f_judge.replace('//', '/') - else: - f_judge = f - if f_judge not in file_ut_map: - if f.endswith('.md'): - ut_list.append('md_placeholder') - elif f.endswith('.h') or f.endswith('.cu'): - if self.is_only_comment(f): - ut_list.append('h_cu_comment_placeholder') - else: - print( - 'PREC dismatch: {} not in file ut map and not md or comment'. - format(f)) - return '' - elif f.endswith('.cc') or f.endswith('.py') or f.endswith( - '.cu'): - if f.find('test_') != -1 or f.find('_test') != -1: - print('PREC {} need check new ut'.format(f)) - check_added_ut = True - elif self.is_only_comment(f): - ut_list.append('nomap_comment_placeholder') - else: - print( - 'PREC dismatch: {} not in file ut map and not new ut or comment'. - format(f)) - return '' + + current_system = platform.system() + notHitMapFiles = [] + hitMapFiles = [] + onlyCommentsFilesOrXpu = [] + file_list = self.get_pr_files() + if 'removed' in file_list: + print("ipipe_log_param_PRECISION_TEST: false") + print("notHitMapFiles: [rm file]") + return '' + else: + for f in file_list: + if current_system == "Darwin" or current_system == "Windows" or self.suffix == ".py3": + f_judge = f.replace(PADDLE_ROOT, '/paddle/', 1) + f_judge = f_judge.replace('//', '/') else: - print('PREC dismatch: {} not in file ut map'.format(f)) - return '' - else: - if self.is_only_comment(f): - ut_list.append('map_comment_placeholder') + f_judge = f + if f_judge not in file_ut_map: + if f_judge.endswith('.md'): + ut_list.append('md_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + elif 'tests/unittests/xpu' in f_judge or 'tests/unittests/npu' in f_judge: + ut_list.append('xpu_npu_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + elif f_judge.endswith(('.h', '.cu', '.cc', 'py')): + if f_judge.find('test_') != -1 or f_judge.find( + '_test') != -1: + check_added_ut = True + if self.is_only_comment(f): + ut_list.append('comment_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + else: + notHitMapFiles.append(f_judge) + else: + notHitMapFiles.append(f_judge) else: - ut_list.extend(file_ut_map.get(f_judge)) - ut_list = list(set(ut_list)) - - if check_added_ut: - with open('{}/added_ut'.format(PADDLE_ROOT)) as utfile: - for ut in utfile: - print('PREC NEW UT: {}'.format(ut.rstrip('\r\n'))) - ut_list.append(ut.rstrip('\r\n')) - - if ut_list: - ret = self.__urlretrieve( - 'https://sys-p0.bj.bcebos.com/prec/prec_delta{}'.format( - self.suffix), 'prec_delta{}'.format(self.suffix)) - if ret: - with open('prec_delta' + self.suffix) as delta: - for ut in delta: - ut_list.append(ut.rstrip('\r\n')) + if self.is_only_comment(f): + ut_list.append('comment_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + else: + hitMapFiles.append(f_judge) + ut_list.extend(file_ut_map.get(f_judge)) + ut_list = list(set(ut_list)) + if len(notHitMapFiles) != 0: 
+ print("ipipe_log_param_PRECISION_TEST: false") + print("notHitMapFiles: %s" % notHitMapFiles) + return '' else: - print('PREC download prec_delta failed') - exit(1) - - return '\n'.join(ut_list) + if check_added_ut: + with open('{}/added_ut'.format(PADDLE_ROOT)) as utfile: + for ut in utfile: + ut_list.append(ut.rstrip('\r\n')) + if ut_list: + ret = self.__urlretrieve( + 'https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta', + 'prec_delta') + if ret: + with open('prec_delta') as delta: + for ut in delta: + ut_list.append(ut.rstrip('\r\n')) + else: + print('PREC download prec_delta failed') + exit(1) + print("ipipe_log_param_PRECISION_TEST: true") + print("ipipe_log_param_PRECISION_TEST_Cases_count: %s" % + len(ut_list)) + PRECISION_TEST_Cases_ratio = format( + float(len(ut_list)) / float(self.get_all_count()), + '.2f') + print("ipipe_log_param_PRECISION_TEST_Cases_ratio: %s" % + PRECISION_TEST_Cases_ratio) + return '\n'.join(ut_list) if __name__ == '__main__': pr_checker = PRChecker() pr_checker.init() - #print(pr_checker.get_pr_ut()) with open('ut_list', 'w') as f: f.write(pr_checker.get_pr_ut()) diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py new file mode 100644 index 0000000000000..42940386ca077 --- /dev/null +++ b/tools/get_single_test_cov.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import time +import sys +import re + + +def getFNDAFile(rootPath, test): + filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) + fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) + os.system('touch %s' % fn_filename) + f = open(filename) + lines = f.readlines() + for line in lines: + line = line.replace('\n', '') + if line.startswith(('SF:')): + os.system('echo %s >> %s' % (line, fn_filename)) + elif line.startswith(('FNDA:')): + hit = int(line.split('FNDA:')[1].split(',')[0]) + if hit != 0: + os.system('echo %s >> %s' % (line, fn_filename)) + f.close() + + +def analysisFNDAFile(rootPath, test): + ut_map_file = '%s/build/ut_map/%s/%s.txt' % (rootPath, test, test) + os.system('touch %s' % ut_map_file) + fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) + f = open(fn_filename) + data = f.read().split('SF:') + for message in data: + if 'FNDA:' in message: + message_list = message.split('\n') + clazz_filename = message_list[0] + if not clazz_filename.endswith('.h'): #filter .h's Analysis + for i in range(1, len(message_list) - 1): + fn = message_list[i] + matchObj = re.match( + r'(.*)Maker(.*)|(.*)Touch(.*)Regist(.*)|(.*)Touch(.*)JitKernel(.*)|(.*)converterC2Ev(.*)', + fn, re.I) + if matchObj == None: + os.system('echo %s >> %s' % + (clazz_filename, ut_map_file)) + break + f.close() + + +def getCovinfo(rootPath, test): + ut_map_path = '%s/build/ut_map/%s' % (rootPath, test) + os.system( + 'cd %s && lcov --capture -d . 
-o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' + % ut_map_path) + os.system( + "cd %s && lcov --extract coverage.info '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/extension/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1" + % ut_map_path) + os.system('rm -rf %s/paddle' % ut_map_path) + os.system('rm -rf %s/coverage.info' % ut_map_path) + getFNDAFile(rootPath, test) + analysisFNDAFile(rootPath, test) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + case = sys.argv[2] + getCovinfo(rootPath, case) diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py new file mode 100644 index 0000000000000..d952a299d490e --- /dev/null +++ b/tools/get_ut_file_map.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import re +import json + + +def get_all_paddle_file(rootPath): + """get all file in Paddle repo: paddle/fluild, python""" + traverse_files = ['%s/paddle/fluid' % rootPath, '%s/python' % rootPath] + all_file_paddle = '%s/build/all_file_paddle' % rootPath + all_file_paddle_list = [] + with open(all_file_paddle, 'w') as f: + for filename in traverse_files: + g = os.walk(filename) + for path, dir_list, file_list in g: + for file_name in file_list: + all_file_paddle_list.append(os.path.join(path, file_name)) + return all_file_paddle_list + + +def get_all_uts(rootPath): + all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + os.system( + 'cd %s/build && ctest -N -V | grep -Ei "Test[ \t]+#" | grep -oEi "\w+$" > %s' + % (rootPath, all_uts_paddle)) + + +def remove_useless_file(rootPath): + """remove useless file in ut_file_map.json""" + all_file_paddle_list = get_all_paddle_file(rootPath) + ut_file_map_new = {} + ut_file_map = "%s/build/ut_file_map.json" % rootPath + with open(ut_file_map, 'r') as load_f: + load_dict = json.load(load_f) + for key in load_dict: + if key in all_file_paddle_list: + ut_file_map_new[key] = load_dict[key] + + with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + json.dump(ut_file_map_new, f, indent=4) + print("remove_useless_file ut_file_map success!!") + + +def handle_ut_file_map(rootPath): + utNotSuccess = '' + ut_map_path = "%s/build/ut_map" % rootPath + files = os.listdir(ut_map_path) + ut_file_map = {} + count = 0 + not_success_file = open("%s/build/prec_delta" % rootPath, 'w') + for ut in files: + count = count + 1 + print("ut %s: %s" % (count, ut)) + coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + if os.path.exists(coverage_info): + filename = '%s/%s/%s.txt' % (ut_map_path, ut, ut) + f = open(filename) + lines = f.readlines() + for line in lines: + line = line.replace('\n', '').strip() + if line == '': + 
continue + elif line.startswith('/paddle/build'): + source_file = line.replace('/build', '') + #source_file = re.sub('.pb.*', '.proto', source_file) + elif 'precise test map fileeee:' in line: + source_file = line.split('precise test map fileeee:')[ + 1].strip() + else: + source_file = line + if source_file not in ut_file_map: + ut_file_map[source_file] = [] + if ut not in ut_file_map[source_file]: + ut_file_map[source_file].append(ut) + + else: + not_success_file.write('%s\n' % ut) + utNotSuccess = utNotSuccess + '^%s$|' % ut + + not_success_file.close() + + with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + json.dump(ut_file_map, f, indent=4) + + print("utNotSuccess:") + print(utNotSuccess) + + +def notsuccessfuc(rootPath): + utNotSuccess = '' + ut_map_path = "%s/build/ut_map" % rootPath + files = os.listdir(ut_map_path) + count = 0 + # ut failed!! + for ut in files: + coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + if os.path.exists(coverage_info): + pass + else: + count = count + 1 + utNotSuccess = utNotSuccess + '^%s$|' % ut + + # ut not exec + get_all_uts(rootPath) + with open("/paddle/build/all_uts_paddle", "r") as f: + data = f.readlines() + for ut in data: + ut = ut.replace('\n', '').strip() + if ut not in files: + print(ut) + count = count + 1 + utNotSuccess = utNotSuccess + '^%s$|' % ut + + if utNotSuccess != '': + print("utNotSuccess count: %s" % count) + f = open('%s/build/utNotSuccess' % rootPath, 'w') + f.write(utNotSuccess[:-1]) + f.close() + + +def ut_file_map_supplement(rootPath): + ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + os.system('mkdir /pre_test') + os.system( + 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json --no-check-certificate' + ) + ut_file_map_old = "/pre_test/ut_file_map.json" + ut_file_map_full = {} + with open(ut_file_map_new, 'r') as load_f: + load_dict_new = json.load(load_f) + with open(ut_file_map_old, 'r') as f: + load_dict_old = json.load(f) + + for filename in load_dict_new: + ut_file_map_full[filename] = load_dict_new[filename] + if filename in load_dict_old: + for ut in load_dict_old[filename]: + if ut not in ut_file_map_full[filename]: + ut_file_map_full[filename].append(ut) + + for filename in load_dict_old: + if filename not in load_dict_new: + ut_file_map_full[filename] = load_dict_old[filename] + + with open("/pre_test/ut_file_map.json", "w") as f: + json.dump(ut_file_map_full, f, indent=4) + print("ut_file_map_full success!!") + + all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + with open(all_uts_paddle, 'r') as f: + all_uts_paddle_list = f.readlines() + f.close() + os.system( + 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta --no-check-certificate' + ) + prec_delta_old = '/pre_test/prec_delta' + prec_delta_new = "%s/build/prec_delta" % rootPath + with open(prec_delta_old, 'r') as f: + prec_delta_old_list = f.readlines() + f.close() + with open(prec_delta_new, 'r') as f: + prec_delta_new_list = f.readlines() + f.close() + for ut in prec_delta_old_list: + if ut not in prec_delta_new_list and ut not in all_uts_paddle_list: + prec_delta_new_list.append(ut) + prec_delta_file = open("/pre_test/prec_delta", 'w') + for ut in prec_delta_new_list: + prec_delta_file.write(ut) + prec_delta_file.close() + + +if __name__ == "__main__": + func = sys.argv[1] + if func == 'get_not_success_ut': + rootPath = sys.argv[2] + notsuccessfuc(rootPath) + elif func == 'get_ut_map': + rootPath = sys.argv[2] + 
handle_ut_file_map(rootPath) + remove_useless_file(rootPath) + ut_file_map_supplement(rootPath) diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py new file mode 100644 index 0000000000000..7c300d96c8461 --- /dev/null +++ b/tools/handle_h_cu_file.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import queue +import threading +import os +import json +import time +import sys + +taskQueue = queue.Queue() + + +def worker(fun): + while True: + temp = taskQueue.get() + fun(temp) + taskQueue.task_done() + + +def threadPool(threadPoolNum): + threadPool = [] + for i in range(threadPoolNum): + thread = threading.Thread(target=worker, args={doFun, }) + thread.daemon = True + threadPool.append(thread) + return threadPool + + +def get_h_file_md5(rootPath): + h_cu_files = '%s/tools/h_cu_files.log' % rootPath + f = open(h_cu_files) + lines = f.readlines() + for line in lines: + line = line.strip() + os.system('md5sum %s >> %s/tools/h_cu_md5.log' % (line, rootPath)) + + +def insert_pile_to_h_file(rootPath): + h_cu_files = '%s/tools/h_cu_files.log' % rootPath + f = open(h_cu_files) + lines = f.readlines() + for line in lines: + line = line.strip() + func = line.replace('/', '_').replace('.', '_') + os.system('echo "\n#ifndef _PRECISE%s_\n" >> %s' % (func.upper(), line)) + os.system('echo "#define _PRECISE%s_" >> %s' % (func.upper(), line)) + os.system('echo "\n#include \n" >> %s' % line) + os.system( + 'echo "__attribute__((constructor)) static void calledFirst%s()\n{" >> %s' + % (func, line)) + os.system( + 'echo \' printf("precise test map fileeee: %%s\\\\n", __FILE__);\n}\' >> %s' + % line) + os.system('echo "\n#endif" >> %s' % line) + + +def get_h_cu_file(file_path): + rootPath = file_path[0] + dir_path = file_path[1] + filename = file_path[2] + ut = filename.replace('^', '').replace('$', '').replace('.log', '') + os.system( + "cat %s/%s | grep 'precise test map fileeee:'| uniq >> %s/build/ut_map/%s/%s.txt" + % (dir_path, filename, rootPath, ut, ut)) + + +def doFun(file_path): + get_h_cu_file(file_path) + + +def main(rootPath, dir_path): + """ + get useful message + """ + startTime = int(time.time()) + test_h_cu_dict = {} + pool = threadPool(23) + for i in range(pool.__len__()): + pool[i].start() + files = os.listdir(dir_path) + for filename in files: + file_path = [rootPath, dir_path, filename] + taskQueue.put(file_path) + taskQueue.join() + endTime = int(time.time()) + print('analy h/cu file cost Time: %s' % (endTime - startTime)) + + +if __name__ == "__main__": + func = sys.argv[1] + if func == 'get_h_file_md5': + rootPath = sys.argv[2] + get_h_file_md5(rootPath) + elif func == 'insert_pile_to_h_file': + rootPath = sys.argv[2] + insert_pile_to_h_file(rootPath) + elif func == 'analy_h_cu_file': + dir_path = sys.argv[2] + rootPath = sys.argv[3] + main(rootPath, dir_path) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 9d03ae22de28f..cb0581d671006 100644 --- 
a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -131,7 +131,6 @@ 'test_ones_op', 'test_npair_loss_op', 'test_nn_functional_embedding_static', - 'test_nce', 'test_name_scope', 'test_naive_executor', 'test_multiprocess_dataloader_iterable_dataset_split', @@ -285,21 +284,15 @@ 'test_default_scope_funcs', 'test_default_dtype', 'test_debugger', - 'test_dataset_wmt', 'test_dataset_voc', 'test_dataset_uci_housing', - 'test_dataset_movielens', 'test_dataset_imikolov', 'test_dataset_imdb', 'test_dataset_conll05', - 'test_dataset_cifar', - 'test_dataloader_unkeep_order', - 'test_dataloader_keep_order', 'test_dataloader_dataset', 'test_data_generator', 'test_data_feeder', 'test_data', - 'test_cyclic_cifar_dataset', 'test_cudnn_placement_pass', 'test_crypto', 'test_crf_decoding_op', @@ -338,7 +331,6 @@ 'test_broadcast_to_op', 'test_broadcast_shape', 'test_broadcast_error', - 'test_broadcast', 'test_bpr_loss_op', 'test_boxps', 'test_bipartite_match_op', @@ -349,8 +341,6 @@ 'test_basic_rnn_name', 'test_attention_lstm_op', 'test_analyzer', - 'test_allreduce', - 'test_allgather', 'test_aligned_allocator', 'system_allocator_test', 'stringprintf_test', @@ -431,6 +421,7 @@ 'buffered_allocator_test', 'broadcast_op_test', 'bfloat16_test', + 'complex_test', 'beam_search_decode_op_test', 'auto_growth_best_fit_allocator_test', 'assign_op_test', @@ -497,7 +488,6 @@ 'test_dist_mnist_ring_allreduce', 'test_fleet_launch_async', 'test_dist_fleet_a_sync_optimizer_geo', - 'test_parallel_dygraph_control_flow', 'test_auto_checkpoint', 'test_fleet_pipeline_meta_optimizer', 'test_dist_fleet_heter_ctr', @@ -519,12 +509,10 @@ 'test_dist_fleet_ps2', 'test_dist_fleet_grad_clip', 'test_custom_concat', - 'test_analyzer_transformer_fuse', 'test_analyzer_seq_pool1_fuse_statis', 'test_fc_lstm_fuse_pass_cc', 'test_layer_norm_fuse_pass', 'test_fc_gru_fuse_pass_cc', - 'test_analyzer_save_model', 'test_fleet_ps', 'test_analyzer_multi_model_prediction', 'test_fleet_base_3', @@ -548,7 +536,6 @@ 'test_bf16_utils', 'test_analyzer_seq_pool1_compare_determine', 'test_avoid_twice_initialization', - 'test_callback_early_stop', 'test_fleet_distributed_strategy', 'test_launch_coverage', 'test_sgd_op_bf16', @@ -571,8 +558,6 @@ 'test_fleet_cc', 'test_repeated_fc_relu_fuse_pass_cc', 'heter_server_test', - 'test_static_save_load_large', - 'graph_node_test', 'test_custom_conj', 'test_fleet_private_function', 'test_fake_init_op', @@ -604,27 +589,21 @@ # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. 
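TETRAD_PARALLEL_JOB (directly below) and TWO_PARALLEL_JOB (further down) hold unit-test names known to fit on a single GPU four or two at a time. These groups are consumed by the CI test scripts to build ctest filters that run several cases at once; the snippet below is only an illustrative sketch of that pattern, and the helper name run_group is made up here rather than taken from this patch:

    import subprocess

    def run_group(test_names, jobs):
        # Build an anchored alternation such as "^test_a$|^test_b$", the same
        # shape as the disable_wingpu_test filters in tools/windows/run_unittests.sh.
        pattern = '|'.join('^{}$'.format(name) for name in test_names)
        # Let ctest schedule `jobs` of the matched cases concurrently.
        subprocess.check_call(
            ['ctest', '-R', pattern, '--output-on-failure', '-j', str(jobs)])

As the comment above says, a case that starts failing with insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED should simply be removed from the group so it falls back to serial execution.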
TETRAD_PARALLEL_JOB = [ + 'graph_node_test', + 'test_assert', + 'test_nce', 'buffered_allocator_test', 'allocator_facade_frac_flags_test', 'cuda_helper_test', - 'sequence_padding_test', 'test_auto_growth_gpu_memory_limit', - 'test_imperative_framework', 'device_context_test', 'test_reference_count_pass_last_lived_ops', 'copy_same_tensor_test', - 'float16_gpu_test', - 'test_leaky_relu_grad_grad_functor', - 'sequence_pooling_test', 'mixed_vector_test', 'op_registry_test', - 'strided_memcpy_test', - 'selected_rows_functor_gpu_test', 'test_prepare_op', 'data_device_transform_test', - 'test_tensor_to_numpy', 'test_naive_best_fit_gpu_memory_limit', - 'vol2col_test', 'test_imperative_using_non_zero_gpu', 'retry_allocator_test', 'system_allocator_test', @@ -643,80 +622,69 @@ 'test_analyzer_seq_pool1', 'test_analyzer_ocr', 'test_analyzer_seq_conv1', - 'test_analyzer_small_dam', 'test_analyzer_mobilenet_depthwise_conv', 'test_analyzer_pyramid_dnn', - 'test_analyzer_text_classification', 'test_analyzer_rnn2', - 'test_analyzer_transformer', 'test_analyzer_resnet50', 'test_analyzer_ner', - 'test_analyzer_lac', - 'test_analyzer_transformer_profile', 'test_analyzer_mobilenet_transpose', 'test_analyzer_rnn1', 'test_analyzer_seq_pool1_profile', 'test_analyzer_paddletensor_tensor', 'test_analyzer_bert', 'test_analyzer_googlenet', - 'zero_copy_tensor_test', - 'custom_tensor_test', 'test_fleet_base', 'test_imperative_container_layerdict', - 'test_complex_simplenet', - 'test_tensor_register_hook', 'test_set_value_op', - 'test_tensor_type_promotion', 'test_view_op_reuse_allocation', - 'test_complex_grad_accumulated', 'test_sequential', 'test_sequential', 'test_imperative_layers', 'test_dgc_momentum_op', 'test_memcpy_op', 'test_dgc_op', - 'test_modelaverage', 'test_lookahead', 'test_callback_visualdl', 'test_new_group_api', 'test_collective_split_embedding_none_divisible', 'test_collective_wait', 'test_collective_split_row_linear', - 'test_collective_split_col_linear', 'test_collective_split_embedding', + 'test_custom_attrs_jit', + 'float16_gpu_test', + 'test_leaky_relu_grad_grad_functor', + 'test_complex_simplenet', + 'selected_rows_functor_gpu_test', + 'test_imperative_framework', ] # It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. 
TWO_PARALLEL_JOB = [ + 'test_tensor_to_numpy', + 'zero_copy_tensor_test', + 'sequence_pooling_test', + 'sequence_padding_test', + 'vol2col_test', 'convert_model2dot_ernie', 'im2col_test', - 'test_elementwise_add_grad_grad', 'test_logical_op', 'test_imperative_mnist', 'test_imperative_deepcf', 'test_cholesky_op', - 'test_multiprocess_dataloader_iterable_dataset_static', 'test_sample_logits_op', 'test_ir_fc_fuse_pass', - 'test_imperative_qat_channelwise', 'test_fleet_base_single', - 'test_imperative_out_scale', 'test_multiprocess_dataloader_iterable_dataset_dynamic', 'test_fill_op', 'test_slice_op', 'test_cond', - 'test_compiled_program', 'test_lstm', 'test_ema', - 'test_py_reader_using_executor', 'test_nan_inf', 'test_isinstance', - 'test_jit_save_load', 'test_box_clip_op', - 'test_group_norm_op', 'test_seed_op', - 'test_activation_nn_grad', 'test_pool2d_int8_mkldnn_op', 'test_adagrad_op_v2', 'test_nn_functional_hot_op', @@ -747,15 +715,12 @@ 'test_lod_reset_op', 'test_install_check', 'test_anchor_generator_op', - 'test_imperative_ptb_rnn', 'test_gather_nd_op', - 'test_flatten_contiguous_range_op', 'test_network_with_dtype', 'test_elementwise_sub_op', 'test_assert_op', 'test_elementwise_div_op', 'test_gather_tree_op', - 'test_decoupled_py_reader', 'test_imperative_named_members', 'test_seqconv_eltadd_relu_fuse_pass', 'test_analysis_predictor', @@ -771,7 +736,6 @@ 'test_traced_layer_err_msg', 'test_unique_with_counts', 'test_auc_single_pred_op', - 'test_stack_op', 'test_conv_bn_fuse_pass', 'test_instance_norm_op_v2', 'test_softmax_bf16_mkldnn_op', @@ -793,10 +757,8 @@ 'test_ctc_align', 'test_imperative_save_load_v2', 'test_decayed_adagrad_op', - 'test_generator_dataloader', 'test_dropout_op', 'test_functional_conv3d', - 'test_executor_return_tensor_not_overwriting', 'test_flatten2_op', 'test_fsp_op', 'test_fusion_transpose_flatten_concat_op', @@ -812,7 +774,6 @@ 'test_temporal_shift_op', 'test_case', 'test_transformer_api', - 'test_bmm_op', 'test_adagrad_op', 'test_batch_norm_mkldnn_op', 'test_adam_op_multi_thread', @@ -842,7 +803,6 @@ 'test_sequence_expand_as', 'test_cos_sim_op', 'test_sequence_enumerate_op', - 'test_cross_entropy2_op', 'test_sequence_concat', 'test_cudnn_lstmcell', 'test_data_norm_op', @@ -947,7 +907,6 @@ 'test_crop_tensor_op', 'test_sequence_expand', 'test_sequence_mask', - 'test_conv_nn_grad', 'test_sequence_pool', 'test_conv_elementwise_add2_act_fuse_pass', 'test_sequence_reshape', @@ -973,9 +932,7 @@ 'test_auc_op', 'test_adam_op', 'test_bilinear_tensor_product_op', - 'test_break_continue', 'test_transpose_mkldnn_op', - 'test_callback_reduce_lr_on_plateau', 'test_cast_op', 'test_scatter_nd_op', 'test_conv2d_transpose_op_depthwise_conv', @@ -990,7 +947,6 @@ 'test_functional_conv2d_transpose', 'test_functional_conv3d_transpose', 'test_dot_op', - 'test_gru_op', 'test_device', 'test_imperative_layer_apply', 'test_dataloader_early_reset', @@ -1064,26 +1020,21 @@ 'test_imperative_optimizer', 'test_assign_value_op', 'test_roi_pool_op', - 'test_imperative_basic', 'test_word2vec', 'test_manual_seed', - 'test_buffer_shared_memory_reuse_pass', 'test_range', 'test_box_decoder_and_assign_op', 'test_imperative_optimizer_v2', 'test_python_operator_overriding', 'test_is_empty_op', - 'test_imperative_qat', 'test_py_reader_pin_memory', 'test_train_recognize_digits', 'test_parallel_executor_feed_persistable_var', 'test_mnist', 'test_update_loss_scaling_op', 'test_rnn_cell_api', - 'test_parallel_executor_fetch_isolated_var', 'test_imperative_load_static_param', 'test_fuse_bn_add_act_pass', 
- 'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass', 'test_quantize_transpiler_v2', 'paddle_infer_api_test', 'test_analyzer_ernie', diff --git a/tools/pyCov_multithreading.py b/tools/pyCov_multithreading.py new file mode 100644 index 0000000000000..2df4ac2ef6b3f --- /dev/null +++ b/tools/pyCov_multithreading.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import commands +from xml.etree import ElementTree +import re +import time +import queue +import threading +import os +import json +import sys + +taskQueue = queue.Queue() +lock = threading.RLock() + + +def worker(fun): + while True: + temp = taskQueue.get() + fun(temp) + taskQueue.task_done() + + +def threadPool(threadPoolNum): + threadPool = [] + for i in range(threadPoolNum): + thread = threading.Thread(target=worker, args={doFun, }) + thread.daemon = True + threadPool.append(thread) + return threadPool + + +def getPyCovResult(params): + rootPath = params[0] + ut = params[1] + print("ut: %s" % ut) + startTime = int(time.time()) + path = '%s/build/pytest/%s' % (rootPath, ut) + os.system('cd %s && coverage combine `ls python-coverage.data.*`' % path) + os.system('cd %s && pwd && coverage xml -i -o python-coverage.xml' % path) + xml_path = '%s/python-coverage.xml' % path + os.system("python %s/tools/analysisPyXml.py %s %s" % + (rootPath, rootPath, ut)) + endTime = int(time.time()) + print('pyCov Time: %s' % (endTime - startTime)) + + +def doFun(params): + getPyCovResult(params) + + +def main(rootPath): + """ + 1. get gcov file + 2. get gcov file not coverageratio = 0 + """ + path = '%s/build/pytest' % rootPath + dirs = os.listdir(path) + pool = threadPool(23) + for i in range(pool.__len__()): + pool[i].start() + for ut in dirs: + params = [rootPath, ut] + taskQueue.put(params) + taskQueue.join() + + +if __name__ == "__main__": + rootPath = sys.argv[1] + main(rootPath) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 52777cd59ba25..a1658e3c2edf7 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -11,12 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
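Both of the new helpers above, tools/handle_h_cu_file.py and tools/pyCov_multithreading.py, drive their per-test work through the same pattern: a module-level queue.Queue, daemon worker threads that block on taskQueue.get() and call doFun on each item, and a main() that fills the queue and waits on taskQueue.join(). A self-contained sketch of that pattern (all names below are chosen for illustration, not taken from the patch):

    import queue
    import threading

    task_queue = queue.Queue()

    def process(item):
        print('processing', item)

    def worker():
        while True:
            item = task_queue.get()   # blocks until an item is available
            process(item)
            task_queue.task_done()    # lets task_queue.join() return once all items finish

    for _ in range(4):
        t = threading.Thread(target=worker)
        t.daemon = True               # daemon threads exit together with the main thread
        t.start()

    for item in range(10):
        task_queue.put(item)
    task_queue.join()                 # wait until every queued item has been processed

One detail worth knowing when reading those files: threading.Thread(target=worker, args={doFun, }) passes a one-element set as args; Thread unpacks it into the single positional argument of worker(fun), so it behaves the same as the more conventional tuple (doFun, ).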
+""" +please make sure to run in the tools path +usage: python sample_test.py {cpu or gpu} + {cpu or gpu}: running in cpu version or gpu version + +for example, you can run cpu version python2 testing like this: + + python sampcd_processor.py cpu +""" import os import sys import subprocess import multiprocessing -import math import platform import inspect import json @@ -24,16 +32,7 @@ import shutil import re import logging -""" -please make sure to run in the tools path -usage: python sample_test.py {cpu or gpu} - {cpu or gpu}: running in cpu version or gpu version - -for example, you can run cpu version python2 testing like this: - - python sampcd_processor.py cpu - -""" +import time logger = logging.getLogger() if logger.handlers: @@ -45,6 +44,7 @@ console.setFormatter(logging.Formatter("%(message)s")) RUN_ON_DEVICE = 'cpu' +SAMPLE_CODE_TEST_CAPACITY = set() GPU_ID = 0 methods = [] whl_error = [] @@ -52,6 +52,15 @@ API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec' API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec' SAMPLECODE_TEMPDIR = 'samplecode_temp' +ENV_KEY_CODES_FRONTEND = 'CODES_INSERTED_INTO_FRONTEND' +ENV_KEY_TEST_CAPACITY = 'SAMPLE_CODE_TEST_CAPACITY' +SUMMARY_INFO = { + 'success': [], + 'failed': [], + 'skiptest': [], + 'nocodes': [], + # ... required not-match +} def find_all(srcstr, substr): @@ -75,32 +84,225 @@ def find_all(srcstr, substr): return indices -def check_indent(cdline): +def find_last_future_line_end(cbstr): + """ + find the last `__future__` line. + + Args: + docstr(str): docstring + Return: + index of the line end or None. """ - to check the indent of a given code line + pat = re.compile('__future__.*\n') + lastmo = None + it = re.finditer(pat, cbstr) + while True: + try: + lastmo = next(it) + except StopIteration: + break + if lastmo: + return lastmo.end() + else: + return None - to get the number of starting blank chars, - e.t. blankspaces and \t - \t will be interpreted as 4 single blankspaces, - e.t. '\t'=' ' +def extract_code_blocks_from_docstr(docstr): + """ + extract code-blocks from the given docstring. + + DON'T include the multiline-string definition in code-blocks. + The *Examples* section must be the last. Args: - cdline(str) : a single line of code from the source file + docstr(str): docstring + Return: + code_blocks: A list of code-blocks, indent removed. + element {'name': the code-block's name, 'id': sequence id. 
+ 'codes': codes, 'required': 'gpu'} + """ + code_blocks = [] + + mo = re.search(r"Examples:", docstr) + if mo is None: + return code_blocks + ds_list = docstr[mo.start():].replace("\t", ' ').split("\n") + lastlineindex = len(ds_list) - 1 + + cb_start_pat = re.compile(r"code-block::\s*python") + cb_param_pat = re.compile(r"^\s*:(\w+):\s*(\S*)\s*$") + cb_required_pat = re.compile(r"^\s*#\s*require[s|d]\s*:\s*(\S+)\s*$") + + cb_info = {} + cb_info['cb_started'] = False + cb_info['cb_cur'] = [] + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur_name'] = None + cb_info['cb_cur_seq_id'] = 0 + cb_info['cb_required'] = None + + def _cb_started(): + # nonlocal cb_started, cb_cur_name, cb_required, cb_cur_seq_id + cb_info['cb_started'] = True + cb_info['cb_cur_seq_id'] += 1 + cb_info['cb_cur_name'] = None + cb_info['cb_required'] = None + + def _append_code_block(): + # nonlocal code_blocks, cb_cur, cb_cur_name, cb_cur_seq_id, cb_required + code_blocks.append({ + 'codes': inspect.cleandoc("\n".join(cb_info['cb_cur'])), + 'name': cb_info['cb_cur_name'], + 'id': cb_info['cb_cur_seq_id'], + 'required': cb_info['cb_required'], + }) + + for lineno, linecont in enumerate(ds_list): + if re.search(cb_start_pat, linecont): + if not cb_info['cb_started']: + _cb_started() + continue + else: + # cur block end + if len(cb_info['cb_cur']): + _append_code_block() + _cb_started() # another block started + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur'] = [] + else: + if cb_info['cb_started']: + # handle the code-block directive's options + mo_p = cb_param_pat.match(linecont) + if mo_p: + if mo_p.group(1) == 'name': + cb_info['cb_cur_name'] = mo_p.group(2) + continue + # read the required directive + mo_r = cb_required_pat.match(linecont) + if mo_r: + cb_info['cb_required'] = mo_r.group(1) + # docstring end + if lineno == lastlineindex: + mo = re.search(r"\S", linecont) + if mo is not None and cb_info['cb_cur_indent'] <= mo.start( + ): + cb_info['cb_cur'].append(linecont) + if len(cb_info['cb_cur']): + _append_code_block() + break + # check indent for cur block start and end. + mo = re.search(r"\S", linecont) + if mo is None: + continue + if cb_info['cb_cur_indent'] < 0: + # find the first non empty line + cb_info['cb_cur_indent'] = mo.start() + cb_info['cb_cur'].append(linecont) + else: + if cb_info['cb_cur_indent'] <= mo.start(): + cb_info['cb_cur'].append(linecont) + else: + if linecont[mo.start()] == '#': + continue + else: + # block end + if len(cb_info['cb_cur']): + _append_code_block() + cb_info['cb_started'] = False + cb_info['cb_cur_indent'] = -1 + cb_info['cb_cur'] = [] + return code_blocks + + +def get_test_capacity(): + """ + collect capacities and set to SAMPLE_CODE_TEST_CAPACITY + """ + global SAMPLE_CODE_TEST_CAPACITY # write + global ENV_KEY_TEST_CAPACITY, RUN_ON_DEVICE # readonly + if ENV_KEY_TEST_CAPACITY in os.environ: + for r in os.environ[ENV_KEY_TEST_CAPACITY].split(','): + rr = r.strip().lower() + if r: + SAMPLE_CODE_TEST_CAPACITY.add(rr) + if 'cpu' not in SAMPLE_CODE_TEST_CAPACITY: + SAMPLE_CODE_TEST_CAPACITY.add('cpu') - Returns: - int : the indent of the number of interpreted - blankspaces + if RUN_ON_DEVICE: + SAMPLE_CODE_TEST_CAPACITY.add(RUN_ON_DEVICE) + + +def is_required_match(requirestr, cbtitle='not-specified'): """ - indent = 0 - for c in cdline: - if c == '\t': - indent += 4 - elif c == ' ': - indent += 1 - if c != ' ' and c != '\t': - break - return indent + search the required instruction in the code-block, and check it match the current running environment. 
+ + environment values of equipped: cpu, gpu, xpu, distributed, skip + the 'skip' is the special flag to skip the test, so is_required_match will return False directly. + + Args: + requirestr(str): the required string. + cbtitle(str): the title of the code-block. + returns: + True - yes, matched + False - not match + None - skipped # trick + """ + global SAMPLE_CODE_TEST_CAPACITY # readonly + requires = set(['cpu']) + if requirestr: + for r in requirestr.split(','): + rr = r.strip().lower() + if rr: + requires.add(rr) + if 'skip' in requires or 'skiptest' in requires: + logger.info('%s: skipped', cbtitle) + return None + + if all([ + k in SAMPLE_CODE_TEST_CAPACITY for k in requires + if k not in ['skip', 'skiptest'] + ]): + return True + + logger.info('%s: the equipments [%s] not match the required [%s].', cbtitle, + ','.join(SAMPLE_CODE_TEST_CAPACITY), ','.join(requires)) + return False + + +def insert_codes_into_codeblock(codeblock, apiname='not-specified'): + """ + insert some codes in the frontend and backend into the code-block. + """ + global ENV_KEY_CODES_FRONTEND, GPU_ID, RUN_ON_DEVICE # readonly + inserted_codes_f = '' + inserted_codes_b = '' + if ENV_KEY_CODES_FRONTEND in os.environ and os.environ[ + ENV_KEY_CODES_FRONTEND]: + inserted_codes_f = os.environ[ENV_KEY_CODES_FRONTEND] + else: + cpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + gpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( + GPU_ID) + if 'required' in codeblock: + if codeblock['required'] is None or codeblock['required'] == 'cpu': + inserted_codes_f = cpu_str + elif codeblock['required'] == 'gpu': + inserted_codes_f = gpu_str + else: + if RUN_ON_DEVICE == "cpu": + inserted_codes_f = cpu_str + elif RUN_ON_DEVICE == "gpu": + inserted_codes_f = gpu_str + inserted_codes_b = '\nprint("{}\'s sample code (name:{}, id:{}) is executed successfully!")'.format( + apiname, codeblock['name'], codeblock['id']) + + cb = codeblock['codes'] + last_future_line_end = find_last_future_line_end(cb) + if last_future_line_end: + return cb[:last_future_line_end] + inserted_codes_f + cb[ + last_future_line_end:] + inserted_codes_b + else: + return inserted_codes_f + cb + inserted_codes_b def sampcd_extract_to_file(srccom, name, htype="def", hname=""): @@ -117,122 +319,111 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): Returns: sample_code_filenames(list of str) """ - global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR - CODE_BLOCK_INTERDUCTORY = "code-block:: python" + global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR # readonly + global SUMMARY_INFO # update - sampcd_begins = find_all(srccom, CODE_BLOCK_INTERDUCTORY) - if len(sampcd_begins) == 0: + codeblocks = extract_code_blocks_from_docstr(srccom) + if len(codeblocks) == 0: + SUMMARY_INFO['nocodes'].append(name) # detect sample codes using >>> to format and consider this situation as wrong - print(htype, " name:", hname) - print("-----------------------") + logger.info(htype + " name:" + name) + logger.info("-----------------------") if srccom.find("Examples:") != -1: - print("----example code check----\n") + logger.info("----example code check----") if srccom.find(">>>") != -1: - print( - "Deprecated sample code style:\n\n Examples:\n\n >>>codeline\n >>>codeline\n\n\n ", - "Please use '.. code-block:: python' to ", - "format sample code.\n") + logger.warning(r"""Deprecated sample code style: + Examples: + >>>codeline + >>>codeline + +Please use '.. 
code-block:: python' to format the sample code.""") return [] else: - print("Error: No sample code!\n") + logger.warning("Error: No sample code!") return [] + sample_code_filenames = [] - for y in range(1, len(sampcd_begins) + 1): - sampcd_begin = sampcd_begins[y - 1] - sampcd = srccom[sampcd_begin + len(CODE_BLOCK_INTERDUCTORY) + 1:] - sampcd = sampcd.split("\n") - # remove starting empty lines - while sampcd[0].replace(' ', '').replace('\t', '') == '': - sampcd.pop(0) - - # the minimum indent, which is the indent of the first - # non-empty line - min_indent = check_indent(sampcd[0]) - sampcd_to_write = [] - for i in range(0, len(sampcd)): - cdline = sampcd[i] - # handle empty lines or those only with spaces/tabs - if cdline.strip() == '': - continue - this_indent = check_indent(cdline) - if this_indent < min_indent: - break - else: - cdline = cdline.replace('\t', ' ') - sampcd_to_write.append(cdline[min_indent:]) - - sampcd = '\n'.join(sampcd_to_write) - if RUN_ON_DEVICE == "cpu": - sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd - if RUN_ON_DEVICE == "gpu": - sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( - GPU_ID) + sampcd - sampcd += '\nprint(' + '\"' + name + ' sample code is executed successfully!\")' - - tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( - name, '.py' if len(sampcd_begins) == 1 else '_{}.py'.format(y))) - with open(tfname, 'w') as tempf: - tempf.write(sampcd) - sample_code_filenames.append(tfname) + for y, cb in enumerate(codeblocks): + matched = is_required_match(cb['required'], name) + # matched has three states: + # True - please execute it; + # None - no sample code found; + # False - it need other special equipment or environment. + # so, the following conditional statements are intentionally arranged. + if matched == True: + tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( + name, '.py' + if len(codeblocks) == 1 else '_{}.py'.format(y + 1))) + with open(tfname, 'w') as tempf: + sampcd = insert_codes_into_codeblock(cb, name) + tempf.write(sampcd) + sample_code_filenames.append(tfname) + elif matched is None: + logger.info('{}\' code block (name:{}, id:{}) is skipped.'.format( + name, cb['name'], cb['id'])) + SUMMARY_INFO['skiptest'].append("{}-{}".format(name, cb['id'])) + elif matched == False: + logger.info( + '{}\' code block (name:{}, id:{}) required({}) not match capacity({}).'. + format(name, cb['name'], cb['id'], cb['required'], + SAMPLE_CODE_TEST_CAPACITY)) + if cb['required'] not in SUMMARY_INFO: + SUMMARY_INFO[cb['required']] = [] + SUMMARY_INFO[cb['required']].append("{}-{}".format(name, cb['id'])) + return sample_code_filenames def execute_samplecode(tfname): """ - Execute a sample-code test. + Execute a sample-code test Args: - tfname: the filename of the samplecode. + tfname: the filename of the sample code Returns: result: success or not tfname: same as the input argument - msg: the stdout output of the samplecode executing. + msg: the stdout output of the sample code executing + time: time consumed by sample code """ result = True msg = None if platform.python_version()[0] in ["2", "3"]: cmd = [sys.executable, tfname] else: - print("Error: fail to parse python version!") + logger.error("Error: fail to parse python version!") result = False exit(1) - # check required envisonment - with open(tfname, 'r') as f: - for line in f.readlines(): - if re.match(r'#\s*required\s*:\s*(distributed|gpu|skip)', line): - result = True - return result, tfname, '{} is skipped. 
cause: {}'.format(tfname, - line) - - logging.info('running %s', tfname) - print("\n----example code check----") - print("executing sample code .....", tfname) + logger.info("----example code check----") + logger.info("executing sample code: %s", tfname) + start_time = time.time() subprc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = subprc.communicate() msg = "".join(output.decode(encoding='utf-8')) err = "".join(error.decode(encoding='utf-8')) + end_time = time.time() if subprc.returncode != 0: - print("Sample code error found in ", tfname, ":") - print("-----------------------") - print(open(tfname).read()) - print("-----------------------") - print("subprocess return code: ", str(subprc.returncode)) - print("Error Raised from Sample Code ", tfname, " :") - print(err) - print(msg) - print("----example code check failed----\n") - logging.warning('%s error: %s', tfname, err) - logging.warning('%s msg: %s', tfname, msg) + with open(tfname, 'r') as f: + logger.warning("""Sample code error found in %s: +----------------------- +%s +----------------------- +subprocess return code: %d +Error Raised from Sample Code: +stderr: %s +stdout: %s +""", tfname, f.read(), subprc.returncode, err, msg) + logger.info("----example code check failed----") result = False else: - print("----example code check success----\n") + logger.info("----example code check success----") # msg is the returned code execution report - return result, tfname, msg + return result, tfname, msg, end_time - start_time def get_filenames(): @@ -317,35 +508,6 @@ def get_incrementapi(): f.write('\n') -def get_wlist(fn="wlist.json"): - ''' - this function will get the white list of API. - - Returns: - - wlist: a list of API that should not trigger the example check . 
- - ''' - wlist = [] - wlist_file = [] - # only white on CPU - gpu_not_white = [] - with open(fn, 'r') as load_f: - load_dict = json.load(load_f) - for key in load_dict: - if key == 'wlist_dir': - for item in load_dict[key]: - wlist_file.append(item["name"]) - elif key == "gpu_not_white": - gpu_not_white = load_dict[key] - elif key == "wlist_api": - for item in load_dict[key]: - wlist.append(item["name"]) - else: - wlist = wlist + load_dict[key] - return wlist, wlist_file, gpu_not_white - - arguments = [ # flags, dest, type, default, help ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'], @@ -391,18 +553,15 @@ def parse_args(): )) logger.addHandler(logfHandler) - wlist, wlist_file, gpu_not_white = get_wlist() - if args.mode == "gpu": GPU_ID = args.gpu_id logger.info("using GPU_ID %d", GPU_ID) - for _gnw in gpu_not_white: - wlist.remove(_gnw) elif args.mode != "cpu": logger.error("Unrecognized argument:%s, 'cpu' or 'gpu' is desired.", args.mode) sys.exit("Invalid arguments") RUN_ON_DEVICE = args.mode + get_test_capacity() logger.info("API check -- Example Code") logger.info("sample_test running under python %s", platform.python_version()) @@ -449,19 +608,50 @@ def parse_args(): if not temp[0]: logger.info("In addition, mistakes found in sample codes: %s", temp[1]) - logger.info("error_methods: %s", str(temp[2])) logger.info("----------------------------------------------------") exit(1) else: - has_error = False + timeovered_test = {} for temp in result: if not temp[0]: logger.info("In addition, mistakes found in sample codes: %s", temp[1]) - logger.info("error_methods: %s", str(temp[2])) - has_error = True - if has_error: - logger.info("Mistakes found in sample codes.") - logger.info("Please check sample codes.") + SUMMARY_INFO['failed'].append(temp[1]) + else: + SUMMARY_INFO['success'].append(temp[1]) + if temp[3] > 10: + timeovered_test[temp[1]] = temp[3] + + if len(timeovered_test): + logger.info("%d sample codes ran time over 10s", + len(timeovered_test)) + if args.debug: + for k, v in timeovered_test.items(): + logger.info('{} - {}s'.format(k, v)) + if len(SUMMARY_INFO['success']): + logger.info("%d sample codes ran success", + len(SUMMARY_INFO['success'])) + for k, v in SUMMARY_INFO.items(): + if k not in ['success', 'failed', 'skiptest', 'nocodes']: + logger.info("%d sample codes required not match for %s", + len(v), k) + if len(SUMMARY_INFO['skiptest']): + logger.info("%d sample codes skipped", + len(SUMMARY_INFO['skiptest'])) + if args.debug: + logger.info('\n'.join(SUMMARY_INFO['skiptest'])) + if len(SUMMARY_INFO['nocodes']): + logger.info("%d apis don't have sample codes", + len(SUMMARY_INFO['nocodes'])) + if args.debug: + logger.info('\n'.join(SUMMARY_INFO['nocodes'])) + if len(SUMMARY_INFO['failed']): + logger.info("%d sample codes ran failed", + len(SUMMARY_INFO['failed'])) + logger.info('\n'.join(SUMMARY_INFO['failed'])) + logger.info( + "Mistakes found in sample codes. Please recheck the sample codes." 
+ ) exit(1) + logger.info("Sample code check is successful!") diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 15bcae826064d..c5ea8891a21ee 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -447,6 +447,8 @@ 'test_sample_logits_op', 'test_save_model_without_var', 'test_scale_op', + 'test_scale_mkldnn_op', + 'test_scale_bf16_mkldnn_op', 'test_scaled_dot_product_attention', 'test_scatter_nd_op', 'test_seed_op', diff --git a/tools/test_model_benchmark.sh b/tools/test_model_benchmark.sh index 720bb33479069..8f8026b0adcef 100644 --- a/tools/test_model_benchmark.sh +++ b/tools/test_model_benchmark.sh @@ -15,6 +15,34 @@ # limitations under the License. +function check_whl { + bash -x paddle/scripts/paddle_build.sh build + [ $? -ne 0 ] && echo "build paddle failed." && exit 1 + pip uninstall -y paddlepaddle_gpu + pip install build/python/dist/*.whl + [ $? -ne 0 ] && echo "install paddle failed." && exit 1 + + mkdir -p /tmp/pr && mkdir -p /tmp/develop + unzip -q build/python/dist/*.whl -d /tmp/pr + + git checkout . + git checkout -b develop_base_pr upstream/$BRANCH + cd build + make -j `nproc` + unzip -q python/dist/*.whl -d /tmp/develop + + sed -i '/version.py/d' /tmp/pr/*/RECORD + sed -i '/version.py/d' /tmp/develop/*/RECORD + diff_whl=`diff /tmp/pr/*/RECORD /tmp/develop/*/RECORD|wc -l` + if [ ${diff_whl} -eq 0 ];then + echo "paddle whl does not diff in PR-CI-Model-benchmark, so skip this ci" + echo "ipipe_log_param_isSkipTest_model_benchmark: 1" + exit 0 + else + echo "ipipe_log_param_isSkipTest_model_benchmark: 0" + fi +} + function compile_install_paddle { export CUDA_ARCH_NAME=Auto export PY_VERSION=3.7 @@ -23,11 +51,7 @@ function compile_install_paddle { export WITH_TENSORRT=OFF export WITH_TESTING=OFF export WITH_UNITY_BUILD=ON - bash -x paddle/scripts/paddle_build.sh build - [ $? -ne 0 ] && echo "build paddle failed." && exit 1 - pip uninstall -y paddlepaddle_gpu - pip install build/python/dist/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl - [ $? -ne 0 ] && echo "install paddle failed." 
&& exit 1 + check_whl } function prepare_data { diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 7836728247f50..81710dae16764 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -20,15 +20,18 @@ import shutil import sys import importlib +import re +import sampcd_processor from sampcd_processor import find_all -from sampcd_processor import check_indent from sampcd_processor import get_api_md5 from sampcd_processor import get_incrementapi -from sampcd_processor import get_wlist from sampcd_processor import sampcd_extract_to_file +from sampcd_processor import extract_code_blocks_from_docstr from sampcd_processor import execute_samplecode - -SAMPLECODE_TEMP_DIR = 'samplecode_temp' +from sampcd_processor import find_last_future_line_end +from sampcd_processor import insert_codes_into_codeblock +from sampcd_processor import get_test_capacity +from sampcd_processor import is_required_match class Test_find_all(unittest.TestCase): @@ -43,27 +46,246 @@ def test_find_two(self): find_all(' hello, world; hello paddle!', 'hello')) -class Test_check_indent(unittest.TestCase): - def test_no_indent(self): - self.assertEqual(0, check_indent('hello paddle')) +class Test_find_last_future_line_end(unittest.TestCase): + def test_no_instant(self): + samplecodes = """ + print(10//3) + """ + self.assertIsNone(find_last_future_line_end(samplecodes)) + + def test_1_instant(self): + samplecodes = """ + from __future__ import print_function + + print(10//3) + """ + mo = re.search("print_function\n", samplecodes) + self.assertIsNotNone(mo) + self.assertGreaterEqual( + find_last_future_line_end(samplecodes), mo.end()) + + def test_2_instant(self): + samplecodes = """ + from __future__ import print_function + from __future__ import division + + print(10//3) + """ + mo = re.search("division\n", samplecodes) + self.assertIsNotNone(mo) + self.assertGreaterEqual( + find_last_future_line_end(samplecodes), mo.end()) + + +class Test_extract_code_blocks_from_docstr(unittest.TestCase): + def test_no_samplecode(self): + docstr = """ + placeholder + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual([], codeblocks) + + def test_codeblock_before_examples_is_ignored(self): + docstr = """ + .. code-block:: python + + print(1+1) + Examples: + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, []) + + def test_1_samplecode(self): + docstr = """ + Examples: + .. code-block:: python + + print(1+1) + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, [{ + 'codes': """print(1+1)""", + 'name': None, + 'id': 1, + 'required': None, + }]) + + def test_2_samplecodes(self): + docstr = """ + placeholder + Examples: + .. code-block:: python + + print(1/0) + + .. 
code-block:: python + :name: one_plus_one + :linenos: + + # required: gpu + print(1+1) + """ + codeblocks = extract_code_blocks_from_docstr(docstr) + self.assertListEqual(codeblocks, [{ + 'codes': """print(1/0)""", + 'name': None, + 'id': 1, + 'required': None, + }, { + 'codes': """# required: gpu +print(1+1)""", + 'name': 'one_plus_one', + 'id': 2, + 'required': 'gpu', + }]) + + +class Test_insert_codes_into_codeblock(unittest.TestCase): + def test_required_None(self): + codeblock = { + 'codes': """print(1/0)""", + 'name': None, + 'id': 1, + 'required': None, + } + self.assertEqual(""" +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" +print(1/0) +print("not-specified's sample code (name:None, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + def test_required_gpu(self): + codeblock = { + 'codes': """# required: gpu +print(1+1)""", + 'name': None, + 'id': 1, + 'required': 'gpu', + } + self.assertEqual(""" +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# required: gpu +print(1+1) +print("not-specified's sample code (name:None, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + def test_from_future(self): + codeblock = { + 'codes': """ +from __future__ import print_function +from __future__ import division +print(10//3)""", + 'name': 'future', + 'id': 1, + 'required': None, + } + self.assertEqual(""" +from __future__ import print_function +from __future__ import division + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" +print(10//3) +print("not-specified's sample code (name:future, id:1) is executed successfully!")""", + insert_codes_into_codeblock(codeblock)) + + +def clear_capacity(): + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY = set() + sampcd_processor.RUN_ON_DEVICE = 'cpu' + if sampcd_processor.ENV_KEY_TEST_CAPACITY in os.environ: + del os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] - def test_indent_4_spaces(self): - self.assertEqual(4, check_indent(' hello paddle')) - def test_indent_1_tab(self): - self.assertEqual(4, check_indent("\thello paddle")) +class Test_get_test_capacity(unittest.TestCase): + def setUp(self): + clear_capacity() + get_test_capacity() + + def tearDown(self): + clear_capacity() + get_test_capacity() + + def test_NoEnvVar(self): + clear_capacity() + get_test_capacity() + self.assertCountEqual(['cpu', ], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_NoEnvVar_RUN_ON_DEVICE_gpu(self): + clear_capacity() + sampcd_processor.RUN_ON_DEVICE = 'gpu' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_EnvVar_gpu(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + def test_EnvVar_gpu_and_distributed(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() + self.assertCountEqual(['cpu', 'gpu', 'distributed'], + sampcd_processor.SAMPLE_CODE_TEST_CAPACITY) + + +class Test_is_required_match(unittest.TestCase): + def setUp(self): + clear_capacity() + + def tearDown(self): + clear_capacity() + get_test_capacity() + + def test_alldefault(self): + clear_capacity() + get_test_capacity() + self.assertTrue(is_required_match('')) + self.assertTrue(is_required_match(None)) + self.assertTrue(is_required_match('cpu')) + self.assertFalse(is_required_match('gpu')) + self.assertIsNone(is_required_match('skiptest')) + 
self.assertIsNone(is_required_match('skip')) + self.assertIsNone(is_required_match('cpu,skiptest')) + + def test_gpu_equipped(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu' + get_test_capacity() + self.assertTrue(is_required_match('cpu')) + self.assertTrue(is_required_match('gpu')) + self.assertTrue(is_required_match('gpu,cpu')) + self.assertIsNone(is_required_match('skiptest')) + self.assertFalse(is_required_match('distributed')) + + def test_gpu_distributed_equipped(self): + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() + self.assertTrue(is_required_match('cpu')) + self.assertTrue(is_required_match('gpu')) + self.assertTrue(is_required_match('distributed')) + self.assertFalse(is_required_match('xpu')) + self.assertIsNone(is_required_match('skiptest')) class Test_execute_samplecode(unittest.TestCase): def setUp(self): - if not os.path.exists(SAMPLECODE_TEMP_DIR): - os.mkdir(SAMPLECODE_TEMP_DIR) - self.successSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, - 'samplecode_success.py') + if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR): + os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR) + self.successSampleCodeFile = os.path.join( + sampcd_processor.SAMPLECODE_TEMPDIR, 'samplecode_success.py') with open(self.successSampleCodeFile, 'w') as f: f.write('print(1+1)') - self.failedSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, - 'samplecode_failed.py') + self.failedSampleCodeFile = os.path.join( + sampcd_processor.SAMPLECODE_TEMPDIR, 'samplecode_failed.py') with open(self.failedSampleCodeFile, 'w') as f: f.write('print(1/0)') @@ -72,37 +294,41 @@ def tearDown(self): os.remove(self.failedSampleCodeFile) def test_run_success(self): - result, tfname, msg = execute_samplecode(self.successSampleCodeFile) + result, tfname, msg, exec_time = execute_samplecode( + self.successSampleCodeFile) self.assertTrue(result) self.assertEqual(self.successSampleCodeFile, tfname) self.assertIsNotNone(msg) self.assertLess(msg.find('skipped'), 0) + self.assertLess(exec_time, 10) def test_run_failed(self): - result, tfname, msg = execute_samplecode(self.failedSampleCodeFile) + result, tfname, msg, exec_time = execute_samplecode( + self.failedSampleCodeFile) self.assertFalse(result) self.assertEqual(self.failedSampleCodeFile, tfname) self.assertIsNotNone(msg) self.assertLess(msg.find('skipped'), 0) + self.assertLess(exec_time, 10) - def test_testcases_skipped(self): - ... 
- tfname = os.path.join(SAMPLECODE_TEMP_DIR, 'samplecode_skipped.py') - with open(tfname, 'w') as f: - f.write("# required: distributed\nprint(1/0)") - result, _, msg = execute_samplecode(tfname) - self.assertTrue(result) - self.assertGreaterEqual(msg.find('skipped'), 0) - os.remove(tfname) + +def clear_summary_info(): + for k in sampcd_processor.SUMMARY_INFO.keys(): + sampcd_processor.SUMMARY_INFO[k].clear() class Test_sampcd_extract_to_file(unittest.TestCase): def setUp(self): - if not os.path.exists(SAMPLECODE_TEMP_DIR): - os.mkdir(SAMPLECODE_TEMP_DIR) + if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR): + os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR) + clear_capacity() + os.environ[sampcd_processor.ENV_KEY_TEST_CAPACITY] = 'gpu,distributed' + get_test_capacity() def tearDown(self): - shutil.rmtree(SAMPLECODE_TEMP_DIR) + shutil.rmtree(sampcd_processor.SAMPLECODE_TEMPDIR) + clear_capacity() + get_test_capacity() def test_1_samplecode(self): comments = """ @@ -113,9 +339,10 @@ def test_1_samplecode(self): """ funcname = 'one_plus_one' sample_code_filenames = sampcd_extract_to_file(comments, funcname) - self.assertCountEqual( - [os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example.py')], - sample_code_filenames) + self.assertCountEqual([ + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example.py') + ], sample_code_filenames) def test_no_samplecode(self): comments = """ @@ -140,10 +367,64 @@ def test_2_samplecodes(self): funcname = 'one_plus_one' sample_code_filenames = sampcd_extract_to_file(comments, funcname) self.assertCountEqual([ - os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_1.py'), - os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_2.py') + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_1.py'), + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_2.py') ], sample_code_filenames) + def test_2_samplecodes_has_skipped(self): + comments = """ + placeholder + Examples: + .. code-block:: python + + # required: skiptest + print(1/0) + + .. code-block:: python + + print(1+1) + + .. code-block:: python + + # required: gpu + print(1//1) + + .. code-block:: python + + # required: xpu + print(1//1) + + .. code-block:: python + + # required: distributed + print(1//1) + + .. 
code-block:: python + + # required: gpu + print(1//1) + """ + funcname = 'one_plus_one' + clear_summary_info() + clear_capacity() + get_test_capacity() + + sample_code_filenames = sampcd_extract_to_file(comments, funcname) + self.assertCountEqual([ + os.path.join(sampcd_processor.SAMPLECODE_TEMPDIR, + funcname + '_example_2.py') + ], sample_code_filenames) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['skiptest'], + [funcname + '-1']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['gpu'], + [funcname + '-3', funcname + '-6']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['xpu'], + [funcname + '-4']) + self.assertCountEqual(sampcd_processor.SUMMARY_INFO['distributed'], + [funcname + '-5']) + class Test_get_api_md5(unittest.TestCase): def setUp(self): @@ -208,55 +489,6 @@ def test_it(self): ], lines) -class Test_get_wlist(unittest.TestCase): - def setUp(self): - self.tmpDir = tempfile.mkdtemp() - self.wlist_filename = os.path.join(self.tmpDir, 'wlist.json') - with open(self.wlist_filename, 'w') as f: - f.write(r''' -{ - "wlist_dir":[ - { - "name":"../python/paddle/fluid/contrib", - "annotation":"" - }, - { - "name":"../python/paddle/verison.py", - "annotation":"" - } - ], - "wlist_api":[ - { - "name":"xxxxx", - "annotation":"not a real api, just for example" - } - ], - "wlist_temp_api":[ - "to_tensor", - "save_persistables@dygraph/checkpoint.py" - ], - "gpu_not_white":[ - "deformable_conv" - ] -} -''') - - def tearDown(self): - os.remove(self.wlist_filename) - shutil.rmtree(self.tmpDir) - - def test_get_wlist(self): - wlist, wlist_file, gpu_not_white = get_wlist(self.wlist_filename) - self.assertCountEqual( - ["xxxxx", "to_tensor", - "save_persistables@dygraph/checkpoint.py"], wlist) - self.assertCountEqual([ - "../python/paddle/fluid/contrib", - "../python/paddle/verison.py", - ], wlist_file) - self.assertCountEqual(["deformable_conv"], gpu_not_white) - - # https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/ops.py # why? unabled to use the ast module. 
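The convention these tests exercise: every runnable example in an API docstring sits in an Examples: section as a '.. code-block:: python' directive, optionally with a ':name:' option, and a '# required: <capacity>' comment inside the block declares what it needs (gpu, xpu, distributed, or skip/skiptest to disable it); a block with no directive only needs the default cpu capacity. An illustrative docstring in that shape, with a made-up API name:

    def hypothetical_api():
        """
        Placeholder description.

        Examples:
            .. code-block:: python
                :name: hypothetical_api_example_gpu

                # required: gpu
                import paddle
                print(paddle.to_tensor([1.0, 2.0]))

            .. code-block:: python

                # no directive here, so the default cpu capacity is enough
                print(1 + 1)
        """

sampcd_extract_to_file() writes each block whose requirement matches the current capacity into samplecode_temp/ and records the skipped or unmatched ones in SUMMARY_INFO, which is exactly what the assertions in the tests above check.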
emmmmm diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index d2cefcc441f6c..68d7ef336edba 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -46,81 +46,45 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then set -e fi - # /*==================Fixed Disabled Windows GPU MKL unittests==============================*/ # TODO: fix these unittest that is bound to fail -diable_wingpu_test="^lite_mul_model_test$|\ -^test_analyzer_int8_resnet50$|\ -^test_gradient_clip$|\ -^test_translated_layer$|\ -^test_imperative_resnet$|\ -^test_imperative_resnet_sorted_gradient$|\ -^test_model$|\ +disable_wingpu_test="^test_model$|\ +^test_dataloader_early_reset$|\ +^test_add_reader_dependency$|\ ^test_decoupled_py_reader$|\ ^test_generator_dataloader$|\ -^test_multiprocess_dataloader_iterable_dataset_static$|\ +^test_parallel_dygraph_sync_batch_norm$|\ ^test_py_reader_using_executor$|\ -^test_parallel_executor_feed_persistable_var$|\ -^test_parallel_executor_fetch_isolated_var$|\ -^test_parallel_executor_inference_feed_partial_data$|\ ^test_parallel_executor_seresnext_base_gpu$|\ ^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ ^test_parallel_executor_seresnext_with_reduce_gpu$|\ -^test_parallel_ssa_graph_inference_feed_partial_data$|\ -^test_sync_batch_norm_op$|\ -^test_fuse_relu_depthwise_conv_pass$|\ -^test_buffer_shared_memory_reuse_pass$|\ -^test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass$|\ -^test_dataloader_keep_order$|\ -^test_dataloader_unkeep_order$|\ -^test_add_reader_dependency$|\ -^test_cholesky_op$|\ -^test_dataloader_early_reset$|\ +^test_program_prune_backward$|\ ^test_decoupled_py_reader_data_check$|\ ^test_fleet_base_single$|\ -^test_fuse_optimizer_pass$|\ ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\ -^test_parallel_dygraph_sync_batch_norm$|\ -^test_partial_eager_deletion_transformer$|\ -^test_rnn_nets$|\ +^test_parallel_executor_feed_persistable_var$|\ +^test_parallel_executor_inference_feed_partial_data$|\ +^test_parallel_ssa_graph_inference_feed_partial_data$|\ ^test_py_reader_combination$|\ ^test_py_reader_pin_memory$|\ ^test_py_reader_push_pop$|\ ^test_reader_reset$|\ ^test_imperative_se_resnext$|\ +^test_sync_batch_norm_op$|\ ^test_imperative_static_runner_while$|\ +^test_dataloader_keep_order$|\ +^test_dataloader_unkeep_order$|\ +^test_multiprocess_dataloader_iterable_dataset_static$|\ ^test_fuse_bn_act_pass$|\ ^test_fuse_bn_add_act_pass$|\ -^test_gru_rnn_op$|\ -^test_rnn_op$|\ -^test_simple_rnn_op$|\ -^test_lstm_cudnn_op$|\ -^test_crypto$|\ -^test_program_prune_backward$|\ -^test_imperative_ocr_attention_model$|\ -^test_sentiment$|\ -^test_imperative_basic$|\ -^test_jit_save_load$|\ -^test_imperative_mnist$|\ -^test_imperative_mnist_sorted_gradient$|\ -^test_imperative_static_runner_mnist$|\ -^test_fuse_all_reduce_pass$|\ -^test_bert$|\ -^test_lac$|\ -^test_mnist$|\ -^test_mobile_net$|\ -^test_ptb_lm$|\ -^test_ptb_lm_v2$|\ -^test_se_resnet$|\ -^test_imperative_qat_channelwise$|\ -^test_imperative_qat$|\ -^test_imperative_out_scale$|\ -^diable_wingpu_test$" +^disable_wingpu_test$" + + # /*============================================================================*/ # /*==================Fixed Disabled Windows CPU OPENBLAS unittests==============================*/ # TODO: fix these unittest that is bound to fail -diable_wincpu_test="^jit_kernel_test$|\ +disable_wincpu_test="^jit_kernel_test$|\ ^test_analyzer_transformer$|\ ^test_vision_models$|\ ^test_dygraph_multi_forward$|\ @@ -134,10 +98,11 
 ^test_mobile_net$|\
 ^test_resnet_v2$|\
 ^test_se_resnet$|\
-^diable_wincpu_test$"
+^disable_wincpu_test$"

 # these unittest that cost long time, diabled temporarily, Maybe moved to the night
 long_time_test="^best_fit_allocator_test$|\
+^test_gru_op$|\
 ^decorator_test$|\
 ^test_dataset_cifar$|\
 ^test_dataset_imdb$|\
@@ -223,7 +188,6 @@ long_time_test="^best_fit_allocator_test$|\
 ^test_strided_slice_op$"

 if [ ${WITH_GPU:-OFF} == "ON" ];then
-    export FLAGS_call_stack_level=2
     export FLAGS_fraction_of_gpu_memory_to_use=0.92
     export CUDA_VISIBLE_DEVICES=0

@@ -274,7 +238,7 @@ function collect_failed_tests() {

 function run_unittest_cpu() {
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -E "$disable_ut_quickly|$diable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
+    (ctest -E "$disable_ut_quickly|$disable_wincpu_test" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
     wait;
 }

@@ -292,16 +256,11 @@ function run_unittest_gpu() {
     echo "************************************************************************"
     export CUDA_VISIBLE_DEVICES=0
     tmpfile=$tmp_dir/$RANDOM
-    (ctest -R "$test_case" -E "$disable_ut_quickly|$diable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
+    (ctest -R "$test_case" -E "$disable_ut_quickly|$disable_wingpu_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) &
     wait;
 }

 function unittests_retry(){
-    if [ "${WITH_GPU:-OFF}" == "ON" ];then
-        parallel_job=1
-    else
-        parallel_job=4
-    fi
     is_retry_execuate=0
     wintest_error=1
     retry_time=3
@@ -338,7 +297,7 @@ function unittests_retry(){
         echo "========================================="
         rm -f $tmp_dir/*
         failed_test_lists=''
-        (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j $parallel_job| tee $tmpfile ) &
+        (ctest -R "($retry_unittests_regular)" --output-on-failure -C Release -j 1 | tee $tmpfile ) &
         wait;
         collect_failed_tests
         exec_times=$(echo $exec_times | awk '{print $0+1}')
@@ -382,10 +341,12 @@ function show_ut_retry_result() {

 set +e

+export FLAGS_call_stack_level=2
 if [ "${WITH_GPU:-OFF}" == "ON" ];then
     if [ -f "$PADDLE_ROOT/added_ut" ];then
         added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$
         ctest -R "(${added_uts})" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$?
+        rm -f $PADDLE_ROOT/added_ut
         if [ "$added_ut_error" != 0 ];then
             echo "========================================"
             echo "Added UT should pass three additional executions"
diff --git a/tools/wlist.json b/tools/wlist.json
deleted file mode 100644
index 5a83a9ee47004..0000000000000
--- a/tools/wlist.json
+++ /dev/null
@@ -1,505 +0,0 @@
-{
-    "wlist_dir":[
-        {
-            "name":"../python/paddle/fluid/contrib",
-            "annotation":""
-        },
-        {
-            "name":"../python/paddle/verison.py",
-            "annotation":""
-        },
-        {
-            "name":"../python/paddle/fluid/core_avx.py",
-            "annotation":""
-        },
-        {
-            "name":"../python/paddle/distributed",
-            "annotation":""
-        }
-    ],
-    "wlist_api":[
-        {
-            "name":"xxxxx",
-            "annotation":"not a real api, just for example"
-        },
-        {
-            "name":"squeeze_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"unsqueeze_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"reshape_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"flatten_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"scatter_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"elu_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"relu_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"softmax_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"tanh_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"ceil_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"floor_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"exp_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"reciprocal_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"round_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"sqrt_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"rsqrt_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"clip_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"scale_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"subtract_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        },
-        {
-            "name":"add_",
-            "annotation":"Inplace APIs don't need sample code. There is a special document introducing Inplace strategy"
-        }
-    ],
-    "wlist_temp_api":[
-        "to_tensor",
-        "LRScheduler",
-        "ReduceOnPlateau",
-        "append_LARS",
-        "BuildStrategy.debug_graphviz_path",
-        "BuildStrategy.enable_sequential_execution",
-        "BuildStrategy.fuse_elewise_add_act_ops",
-        "BuildStrategy.fuse_relu_depthwise_conv",
-        "BuildStrategy.gradient_scale_strategy",
-        "BuildStrategy.reduce_strategy",
-        "BuildStrategy.remove_unnecessary_lock",
-        "BuildStrategy.sync_batch_norm",
-        "DynamicRNN.step_input",
-        "DynamicRNN.static_input",
-        "DynamicRNN.block",
-        "DynamicRNN.update_memory",
-        "DynamicRNN.output",
-        "transpiler.DistributeTranspilerConfig",
-        "transpiler.DistributeTranspilerConfig.slice_var_up",
-        "transpiler.DistributeTranspilerConfig.split_method",
-        "transpiler.DistributeTranspilerConfig.min_block_size",
-        "DistributeTranspilerConfig.slice_var_up",
-        "DistributeTranspilerConfig.split_method",
-        "ModelAverage.apply",
-        "ModelAverage.restore",
-        "DistributeTranspilerConfig",
-        "DistributeTranspilerConfig.min_block_size",
-        "ExecutionStrategy.allow_op_delay",
-        "load",
-        "Accuracy.update",
-        "ChunkEvaluator.update",
-        "ExecutionStrategy.num_iteration_per_drop_scope",
-        "ExecutionStrategy.num_threads",
-        "CompiledProgram._with_inference_optimize",
-        "CompositeMetric.add_metric",
-        "CompositeMetric.update",
-        "CompositeMetric.eval",
-        "DetectionMAP.get_map_var",
-        "MetricBase",
-        "MetricBase.reset",
-        "MetricBase.get_config",
-        "MetricBase.update",
-        "MetricBase.eval",
-        "Accuracy.eval",
-        "Auc.update",
-        "Auc.eval",
-        "EditDistance.update",
-        "EditDistance.eval",
-        "ExponentialMovingAverage.apply",
-        "ExponentialMovingAverage.restore",
-        "ExponentialMovingAverage.update",
-        "StaticRNN.step",
-        "StaticRNN.step_input",
-        "StaticRNN.step_output",
-        "StaticRNN.update_memory",
-        "DetectionMAP.reset",
-        "StaticRNN.output",
-        "cuda_places",
-        "CUDAPinnedPlace",
-        "CUDAPlace",
-        "Program.parse_from_string",
-        "Compressor",
-        "Compressor.config",
-        "Compressor.run",
-        "HDFSClient.upload",
-        "HDFSClient.download",
-        "HDFSClient.is_exist",
-        "HDFSClient.is_dir",
-        "HDFSClient.delete",
-        "HDFSClient.rename",
-        "HDFSClient.makedirs",
-        "HDFSClient.ls",
-        "HDFSClient.lsr",
-        "multi_download",
-        "multi_upload",
-        "TrainingDecoder.block",
-        "QuantizeTranspiler.training_transpile",
-        "QuantizeTranspiler.freeze_program",
-        "AutoMixedPrecisionLists",
-        "Uniform.sample",
-        "Uniform.log_prob",
-        "Uniform.entropy",
-        "Categorical.kl_divergence",
-        "Categorical.entropy",
-        "MultivariateNormalDiag.entropy",
-        "MultivariateNormalDiag.kl_divergence",
-        "RNNCell",
-        "RNNCell.call",
-        "RNNCell.get_initial_states",
-        "GRUCell.call",
-        "LSTMCell.call",
-        "Decoder",
-        "Decoder.initialize",
-        "Decoder.step",
-        "Decoder.finalize",
-        "fused_elemwise_activation",
-        "search_pyramid_hash",
-        "convert_dist_to_sparse_program",
-        "load_persistables_for_increment",
-        "load_persistables_for_inference",
-        "xmap_readers",
-        "Metric.reset",
-        "Metric.update",
-        "Metric.accumulate",
-        "Metric.name",
-        "Metric.compute",
-        "Accuracy.reset",
-        "Accuracy.update",
-        "Accuracy.accumulate",
-        "Accuracy.name",
-        "Accuracy.compute",
-        "Precision.reset",
-        "Precision.update",
-        "Precision.accumulate",
-        "Precision.name",
-        "Precision.compute",
-        "Recall.reset",
-        "Recall.update",
-        "Recall.accumulate",
-        "Recall.name",
-        "Recall.compute",
-        "Auc.reset",
-        "Auc.update",
-        "Auc.accumulate",
-        "Auc.name",
-        "Auc.compute",
-        "Callback.set_params",
-        "Callback.on_train_begin",
-        "Callback.on_train_end",
-        "Callback.on_eval_begin",
-        "Callback.on_eval_end",
-        "Callback.on_test_begin",
-        "Callback.on_test_end",
-        "Callback.on_epoch_begin",
-        "Callback.on_epoch_end",
-        "Callback.on_train_batch_begin",
-        "Callback.on_train_batch_end",
-        "Callback.on_eval_batch_begin",
-        "Callback.on_eval_batch_end",
-        "Callback.on_test_batch_begin",
-        "Callback.on_test_batch_end",
-        "Model.prepare",
-        "SimpleRNNCell",
-        "SimpleRNNCell.forward",
-        "LSTMCell",
-        "LSTMCell.forward",
-        "GRUCell",
-        "GRUCell.forward",
-        "SimpleRNN",
-        "GRU",
-        "LSTM",
-        "RNN",
-        "BiRNN",
-        "RNNCellBase",
-        "RNNCellBase.get_initial_states",
-        "gelu",
-        "erf",
-        "DecodeHelper",
-        "DecodeHelper.initialize",
-        "DecodeHelper.sample",
-        "DecodeHelper.next_inputs",
-        "TrainingHelper.initialize",
-        "TrainingHelper.sample",
-        "TrainingHelper.next_inputs",
-        "GreedyEmbeddingHelper.initialize",
-        "GreedyEmbeddingHelper.sample",
-        "GreedyEmbeddingHelper.next_inputs",
-        "LayerList.append",
-        "HDFSClient",
-        "InitState",
-        "TracedLayer",
-        "SampleEmbeddingHelper.sample",
-        "BasicDecoder.initialize",
-        "BasicDecoder.step",
-        "ParameterList.append",
-        "GreedyEmbeddingHelper",
-        "SampleEmbeddingHelper",
-        "BasicDecoder",
-        "lstm",
-        "partial_sum",
-        "StateCell",
-        "StateCell.compute_state",
-        "TrainingDecoder",
-        "TrainingDecoder.step_input",
-        "TrainingDecoder.static_input",
-        "TrainingDecoder.output",
-        "BeamSearchDecoder",
-        "GradClipByValue",
-        "GradClipByNorm",
-        "Variable.detach",
-        "Variable.numpy",
-        "Variable.set_value",
-        "Variable.gradient",
-        "BeamSearchDecoder.decode",
-        "BeamSearchDecoder.read_array",
-        "CompiledProgram",
-        "CompiledProgram.with_data_parallel",
-        "append_backward",
-        "guard",
-        "to_variable",
-        "op_freq_statistic",
-        "save_dygraph",
-        "load_dygraph",
-        "ParallelExecutor",
-        "ParallelExecutor.run",
-        "ParallelExecutor.drop_local_exe_scopes",
-        "GradClipByGlobalNorm",
-        "extend_with_decoupled_weight_decay",
-        "switch",
-        "Normal",
-        "memory_usage",
-        "decorate",
-        "PiecewiseDecay",
-        "InverseTimeDecay",
-        "PolynomialDecay",
-        "NoamDecay",
-        "start_profiler",
-        "profiler",
-        "tree_conv",
-        "multiclass_nms2",
-        "DataFeedDesc",
-        "Conv2D",
-        "Conv3D",
-        "Conv3DTranspose",
-        "Embedding",
-        "NCE",
-        "PRelu",
-        "BilinearTensorProduct",
-        "GroupNorm",
-        "SpectralNorm",
-        "TreeConv",
-        "prroi_pool",
-        "ChunkEvaluator",
-        "EditDistance",
-        "ErrorClipByValue",
-        "Program.clone",
-        "cuda_pinned_places",
-        "DataFeeder",
-        "elementwise_floordiv",
-        "Layer",
-        "Layer.create_parameter",
-        "Layer.create_variable",
-        "Layer.sublayers",
-        "Layer.add_parameter",
-        "Layer.add_sublayer",
-        "Layer.parameters",
-        "Tracer",
-        "Layer.full_name",
-        "InMemoryDataset",
-        "layer_norm",
-        "bipartite_match",
-        "double_buffer",
-        "cumsum",
-        "thresholded_relu",
-        "group_norm",
-        "random_crop",
-        "row_conv",
-        "hard_shrink",
-        "ssd_loss",
-        "retinanet_target_assign",
-        "InMemoryDataset.global_shuffle",
-        "InMemoryDataset.get_memory_data_size",
-        "DetectionMAP",
-        "hash",
-        "InMemoryDataset.set_queue_num",
-        "LayerNorm",
-        "Preprocessor",
-        "chunk_eval",
-        "GRUUnit",
-        "ExponentialMovingAverage",
-        "QueueDataset.global_shuffle",
-        "NumpyArrayInitializer",
-        "create_py_reader_by_data",
-        "InMemoryDataset.local_shuffle",
-        "InMemoryDataset.get_shuffle_data_size",
-        "size",
-        "edit_distance",
-        "nce",
-        "BilinearInitializer",
-        "NaturalExpDecay",
-        "noam_decay",
-        "retinanet_detection_output",
-        "Pool2D",
-        "PipelineOptimizer",
-        "generate_mask_labels",
-        "isfinite",
-        "InMemoryDataset.set_fleet_send_batch_size",
-        "cuda_profiler",
-        "unfold",
-        "Executor",
-        "InMemoryDataset.load_into_memory",
-        "ExponentialDecay",
-        "BatchNorm",
-        "deformable_conv",
-        "InMemoryDataset.preload_into_memory",
-        "py_reader",
-        "linear_lr_warmup",
-        "InMemoryDataset.wait_preload_done",
-        "CosineDecay",
-        "roi_perspective_transform",
-        "unique",
-        "ones_like",
-        "LambOptimizer",
-        "InMemoryDataset.release_memory",
-        "Conv2DTranspose",
-        "QueueDataset.local_shuffle",
-        "save_persistables@dygraph/checkpoint.py",
-        "load_persistables@dygraph/checkpoint.py",
-        "elementwise_pow",
-        "WeightedAverage.reset",
-        "ChunkEvaluator.eval",
-        "NCE.forward",
-        "elementwise_div",
-        "BilinearTensorProduct.forward",
-        "NoamDecay.step",
-        "elementwise_min",
-        "PiecewiseDecay.step",
-        "Conv3DTranspose.forward",
-        "elementwise_add",
-        "IfElse.output",
-        "IfElse.true_block",
-        "InverseTimeDecay.step",
-        "PolynomialDecay.step",
-        "Precision.eval",
-        "enabled",
-        "elementwise_max",
-        "stop_gperf_profiler",
-        "IfElse.false_block",
-        "WeightedAverage.add",
-        "Auc.trapezoid_area",
-        "elementwise_mul",
-        "GroupNorm.forward",
-        "SpectralNorm.forward",
-        "elementwise_sub",
-        "Switch.case",
-        "IfElse.input",
-        "prepare_context",
-        "PRelu.forward",
-        "Recall.update",
-        "start_gperf_profiler",
-        "TreeConv.forward",
-        "Conv2D.forward",
-        "Switch.default",
-        "elementwise_mod",
-        "Precision.update",
-        "WeightedAverage.eval",
-        "Conv3D.forward",
-        "Embedding.forward",
-        "Recall.eval",
-        "FC.forward",
-        "While.block",
-        "DGCMomentumOptimizer",
-        "ParallelEnv",
-        "spawn",
-        "init_parallel_env",
-        "DataParallel",
-        "DataParallel.scale_loss",
-        "DataParallel.apply_collective_grads",
-        "BasicLSTMCell.forward",
-        "BasicGRUCell.forward",
-        "RNN.forward",
-        "StackedRNNCell.forward",
-        "StackedLSTMCell.forward",
-        "LSTM.forward",
-        "BidirectionalRNN.forward",
-        "BidirectionalLSTM.forward",
-        "StackedGRUCell.forward",
-        "GRU.forward",
-        "BidirectionalGRU.forward",
-        "DynamicDecode.forward",
-        "Conv1dPoolLayer.forward",
-        "CNNEncoder.forward",
-        "TransformerCell.forward",
-        "TransformerBeamSearchDecoder.step",
-        "MultiHeadAttention.forward",
-        "MultiHeadAttention.cal_kv",
-        "FFN.forward",
-        "TransformerEncoderLayer.forward",
-        "TransformerEncoder.forward",
-        "TransformerDecoderLayer.forward",
-        "TransformerDecoder.forward",
-        "TransformerDecoder.prepare_static_cache",
-        "TransformerDecoder.prepare_incremental_cache",
-        "LinearChainCRF.forward",
-        "CRFDecoding.forward",
-        "SequenceTagging.forward",
-        "XPUPlace",
-        "is_compiled_with_xpu",
-        "xpu_places"
-    ],
-    "gpu_not_white":[
-        "deformable_conv",
-        "cuda_places",
-        "CUDAPinnedPlace",
-        "CUDAPlace",
-        "cuda_profiler",
-        "DGCMomentumOptimizer"
-    ]
-}